Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] nvme: enable FDP support
       [not found] <CGME20240528151007epcas5p32583675f647553923e5ba4987e9bc6ed@epcas5p3.samsung.com>
@ 2024-05-28 15:02 ` Kanchan Joshi
  2024-06-07 15:14   ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Kanchan Joshi @ 2024-05-28 15:02 UTC (permalink / raw)
  To: axboe, kbusch, hch, sagi
  Cc: linux-nvme, javier.gonz, bvanassche, gost.dev, Kanchan Joshi,
	Hui Qi, Nitesh Shetty

Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
to control the placement of logical blocks so as to reduce the SSD WAF.

Userspace can send the data lifetime information using the write hints.
The SCSI driver (sd) can already pass this information to the SCSI
devices. This patch does the same for NVMe.

Fetches the placement-identifiers (plids) if the device supports FDP.
And map the incoming write-hints to plids.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Hui Qi <hui81.qi@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
Changes since v1:
- Reduce the fetched plids from 128 to 6 (Keith)
- Use struct_size for a calculation (Keith)
- Handle robot/sparse warning

 drivers/nvme/host/core.c | 67 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  4 +++
 include/linux/nvme.h     | 19 ++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 954f850f113a..9b67c3afe003 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -42,6 +42,20 @@ struct nvme_ns_info {
 	bool is_removed;
 };
 
+struct nvme_fdp_ruh_status_desc {
+	u16 pid;
+	u16 ruhid;
+	u32 earutr;
+	u64 ruamw;
+	u8  rsvd16[16];
+};
+
+struct nvme_fdp_ruh_status {
+	u8  rsvd0[14];
+	__le16 nruhsd;
+	struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
 unsigned int admin_timeout = 60;
 module_param(admin_timeout, uint, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@@ -922,6 +936,16 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 	return BLK_STS_OK;
 }
 
+static inline void nvme_assign_placement_id(struct nvme_ns *ns,
+					struct request *req,
+					struct nvme_command *cmd)
+{
+	enum rw_hint h = umin(ns->head->nr_plids - 1, req->write_hint);
+
+	cmd->rw.control |= cpu_to_le16(NVME_RW_DTYPE_DPLCMT);
+	cmd->rw.dsmgmt |= cpu_to_le32(ns->head->plids[h] << 16);
+}
+
 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 		struct request *req, struct nvme_command *cmnd,
 		enum nvme_opcode op)
@@ -1037,6 +1061,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
 		break;
 	case REQ_OP_WRITE:
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+		if (!ret && ns->head->nr_plids)
+			nvme_assign_placement_id(ns, req, cmd);
 		break;
 	case REQ_OP_ZONE_APPEND:
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
@@ -2049,6 +2075,40 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
 	return ret;
 }
 
+static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
+{
+	struct nvme_command c = {};
+	struct nvme_fdp_ruh_status *ruhs;
+	struct nvme_fdp_ruh_status_desc *ruhsd;
+	int size, ret, i;
+
+	size = struct_size(ruhs, ruhsd, NVME_MAX_PLIDS);
+	ruhs = kzalloc(size, GFP_KERNEL);
+	if (!ruhs)
+		return -ENOMEM;
+
+	c.imr.opcode = nvme_cmd_io_mgmt_recv;
+	c.imr.nsid = cpu_to_le32(nsid);
+	c.imr.mo = 0x1;
+	c.imr.numd =  cpu_to_le32((size >> 2) - 1);
+
+	ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+	if (ret)
+		goto out;
+
+	ns->head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+	ns->head->nr_plids =
+		min_t(u16, ns->head->nr_plids, NVME_MAX_PLIDS);
+
+	for (i = 0; i < ns->head->nr_plids; i++) {
+		ruhsd = &ruhs->ruhsd[i];
+		ns->head->plids[i] = le16_to_cpu(ruhsd->pid);
+	}
+out:
+	kfree(ruhs);
+	return ret;
+}
+
 static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		struct nvme_ns_info *info)
 {
@@ -2136,6 +2196,13 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		if (ret && !nvme_first_scan(ns->disk))
 			goto out;
 	}
+	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+		ret = nvme_fetch_fdp_plids(ns, info->nsid);
+		if (ret)
+			dev_warn(ns->ctrl->device,
+				"FDP failure status:0x%x\n", ret);
+	}
+
 
 	ret = 0;
 out:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cacc56f4bbf4..bec3024d6af9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -445,6 +445,8 @@ struct nvme_ns_ids {
 	u8	csi;
 };
 
+#define NVME_MAX_PLIDS   (WRITE_LIFE_EXTREME + 1)
+
 /*
  * Anchor structure for namespaces.  There is one for each namespace in a
  * NVMe subsystem that any of our controllers can see, and the namespace
@@ -462,6 +464,8 @@ struct nvme_ns_head {
 	bool			shared;
 	bool			passthru_err_log_enabled;
 	int			instance;
+	u16			nr_plids;
+	u16			plids[NVME_MAX_PLIDS];
 	struct nvme_effects_log *effects;
 	u64			nuse;
 	unsigned		ns_id;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 425573202295..fc07ba1b5ec5 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -270,6 +270,7 @@ enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
 	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 	NVME_CTRL_ATTR_ELBAS		= (1 << 15),
+	NVME_CTRL_ATTR_FDPS		= (1 << 19),
 };
 
 struct nvme_id_ctrl {
@@ -829,6 +830,7 @@ enum nvme_opcode {
 	nvme_cmd_resv_register	= 0x0d,
 	nvme_cmd_resv_report	= 0x0e,
 	nvme_cmd_resv_acquire	= 0x11,
+	nvme_cmd_io_mgmt_recv	= 0x12,
 	nvme_cmd_resv_release	= 0x15,
 	nvme_cmd_zone_mgmt_send	= 0x79,
 	nvme_cmd_zone_mgmt_recv	= 0x7a,
@@ -850,6 +852,7 @@ enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_register),	\
 		nvme_opcode_name(nvme_cmd_resv_report),		\
 		nvme_opcode_name(nvme_cmd_resv_acquire),	\
+		nvme_opcode_name(nvme_cmd_io_mgmt_recv),	\
 		nvme_opcode_name(nvme_cmd_resv_release),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\
@@ -1001,6 +1004,7 @@ enum {
 	NVME_RW_PRINFO_PRCHK_GUARD	= 1 << 12,
 	NVME_RW_PRINFO_PRACT		= 1 << 13,
 	NVME_RW_DTYPE_STREAMS		= 1 << 4,
+	NVME_RW_DTYPE_DPLCMT		= 2 << 4,
 	NVME_WZ_DEAC			= 1 << 9,
 };
 
@@ -1088,6 +1092,20 @@ struct nvme_zone_mgmt_recv_cmd {
 	__le32			cdw14[2];
 };
 
+struct nvme_io_mgmt_recv_cmd {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__le64			rsvd2[2];
+	union nvme_data_ptr	dptr;
+	__u8			mo;
+	__u8			rsvd11;
+	__u16			mos;
+	__le32			numd;
+	__le32			cdw12[4];
+};
+
 enum {
 	NVME_ZRA_ZONE_REPORT		= 0,
 	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
@@ -1808,6 +1826,7 @@ struct nvme_command {
 		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
+		struct nvme_io_mgmt_recv_cmd imr;
 	};
 };
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-05-28 15:02 ` [PATCH v2] nvme: enable FDP support Kanchan Joshi
@ 2024-06-07 15:14   ` Keith Busch
  2024-06-08  5:17     ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2024-06-07 15:14 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, hch, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev,
	Hui Qi, Nitesh Shetty

On Tue, May 28, 2024 at 08:32:33PM +0530, Kanchan Joshi wrote:
> Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
> to control the placement of logical blocks so as to reduce the SSD WAF.
> 
> Userspace can send the data lifetime information using the write hints.
> The SCSI driver (sd) can already pass this information to the SCSI
> devices. This patch does the same for NVMe.
> 
> Fetches the placement-identifiers (plids) if the device supports FDP.
> And map the incoming write-hints to plids.

Looks good to me. I'll queue this up for 6.11 after we can rebase to the
block 6.11 tree once that's created.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-07 15:14   ` Keith Busch
@ 2024-06-08  5:17     ` Christoph Hellwig
  2024-06-10 10:38       ` Kanchan Joshi
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2024-06-08  5:17 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, axboe, hch, sagi, linux-nvme, javier.gonz,
	bvanassche, gost.dev, Hui Qi, Nitesh Shetty

Nacked-by: Christoph Hellwig <hch@lst.de>

This is a really overcomplicated way that does not in any way fit the
FDP use case (which is also rather overcomplicated).

If you want to pass on life time information to a NVMe device please
work with the NVMe technical working group to add the equivalent of
the life time hints added to SCSI as part of the
constrained streams in SBC.  It would be implementable in the Linux
nvme driver in a few lines of code, similaly trivially in the device
and actually makes things work.



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-08  5:17     ` Christoph Hellwig
@ 2024-06-10 10:38       ` Kanchan Joshi
  2024-06-10 11:27         ` Martin K. Petersen
  0 siblings, 1 reply; 12+ messages in thread
From: Kanchan Joshi @ 2024-06-10 10:38 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch
  Cc: axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev,
	Hui Qi, Nitesh Shetty

On 6/8/2024 10:47 AM, Christoph Hellwig wrote:
> Nacked-by: Christoph Hellwig<hch@lst.de>
> 
> This is a really overcomplicated way that does not in any way fit the
> FDP use case (which is also rather overcomplicated).
> 
> If you want to pass on life time information to a NVMe device please
> work with the NVMe technical working group to add the equivalent of
> the life time hints added to SCSI as part of the
> constrained streams in SBC.  It would be implementable in the Linux
> nvme driver in a few lines of code, similarly trivially in the device
> and actually makes things work.

I disagree. Bart's patch using "constrained streams in SBC" adds 104 
lines [*]. This patch adds 90 lines.
Both do the conceptually similar two steps equally trivially:

Step #1. Query protocol specific hints.
- sd_read_io_hints (scsi): sends one/two commands to the device
- nvme_fetch_fdp_plids (nvme): sends one command to the device

Step #2. map write-hints to protocol specific hints.
- sd_group_number (scsi)
- nvme_assign_placement_id (nvme)

So current plumbing is nearly identical and as simple as SCSI.

And TP 4146 author list shows 10 companies. Perhaps good enough for 
diverse opinions on how to go about things before settling down.

We have used passthrough interface for FDP wherever possible. But there 
are users asking file/block interface for what has been standardized. We 
end up sharing this patch as a off-tree solution (which is a pity).

And this was discussed at LSFMM too; There was no push back to move forward.

[*] 
https://lore.kernel.org/linux-scsi/20240222214508.1630719-4-bvanassche@acm.org/


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-10 10:38       ` Kanchan Joshi
@ 2024-06-10 11:27         ` Martin K. Petersen
  2024-06-10 11:53           ` Javier González
  2024-06-10 11:55           ` [PATCH v2] " Christoph Hellwig
  0 siblings, 2 replies; 12+ messages in thread
From: Martin K. Petersen @ 2024-06-10 11:27 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, Keith Busch, axboe, sagi, linux-nvme,
	javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty


Hi Kanchan!

> So current plumbing is nearly identical and as simple as SCSI.

I don't have a problem with your implementation, however...

> And TP 4146 author list shows 10 companies. Perhaps good enough for 
> diverse opinions on how to go about things before settling down.

I think FDP and SCSI streams are less than ideal. 10+ years and the
standards bodies still haven't been able to produce an approach that
makes sense in the context of a general purpose operating system.

-- 
Martin K. Petersen	Oracle Linux Engineering


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: nvme: enable FDP support
  2024-06-10 11:27         ` Martin K. Petersen
@ 2024-06-10 11:53           ` Javier González
  2024-06-10 11:55           ` [PATCH v2] " Christoph Hellwig
  1 sibling, 0 replies; 12+ messages in thread
From: Javier González @ 2024-06-10 11:53 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch, axboe, sagi,
	linux-nvme, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On 10.06.2024 07:27, Martin K. Petersen wrote:
>
>Hi Kanchan!
>
>> So current plumbing is nearly identical and as simple as SCSI.
>
>I don't have a problem with your implementation, however...
>
>> And TP 4146 author list shows 10 companies. Perhaps good enough for
>> diverse opinions on how to go about things before settling down.
>
>I think FDP and SCSI streams are less than ideal. 10+ years and the
>standards bodies still haven't been able to produce an approach that
>makes sense in the context of a general purpose operating system.

I want to believe that in each iteration we are getting closer to
something that can actually be deployed without major OS and application
changes.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-10 11:27         ` Martin K. Petersen
  2024-06-10 11:53           ` Javier González
@ 2024-06-10 11:55           ` Christoph Hellwig
  2024-06-10 14:52             ` Keith Busch
  1 sibling, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2024-06-10 11:55 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch, axboe, sagi,
	linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi,
	Nitesh Shetty

On Mon, Jun 10, 2024 at 07:27:16AM -0400, Martin K. Petersen wrote:
> > And TP 4146 author list shows 10 companies. Perhaps good enough for 
> > diverse opinions on how to go about things before settling down.
> 
> I think FDP and SCSI streams are less than ideal. 10+ years and the
> standards bodies still haven't been able to produce an approach that
> makes sense in the context of a general purpose operating system.

The SCSI temperature hints are I think exactly what we want.  The fact
that they had to be shoe horned in in a weird to claim to be streams
is a little awkward, but sometimes that's need to get things done.

Hint to Samsung and Meta: if you have to hack them into NVMe to
still claim it's FDP so that your marketing budget isn't lost we
can probably live with that..



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-10 11:55           ` [PATCH v2] " Christoph Hellwig
@ 2024-06-10 14:52             ` Keith Busch
  2024-06-11  5:47               ` Christoph Hellwig
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2024-06-10 14:52 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Martin K. Petersen, Kanchan Joshi, axboe, sagi, linux-nvme,
	javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Mon, Jun 10, 2024 at 01:55:41PM +0200, Christoph Hellwig wrote:
> On Mon, Jun 10, 2024 at 07:27:16AM -0400, Martin K. Petersen wrote:
> > > And TP 4146 author list shows 10 companies. Perhaps good enough for 
> > > diverse opinions on how to go about things before settling down.
> > 
> > I think FDP and SCSI streams are less than ideal. 10+ years and the
> > standards bodies still haven't been able to produce an approach that
> > makes sense in the context of a general purpose operating system.
> 
> The SCSI temperature hints are I think exactly what we want.  The fact
> that they had to be shoe horned in in a weird to claim to be streams
> is a little awkward, but sometimes that's need to get things done.

I feel a bit out of the loop here, I'm not sure what the concern is.

I agree the FDP setup is complicated, but none of that is taken on by
the driver. It just discovers the capabilities and maps an arbitrary
software "hint" to an arbitrary device "hint". It's up to the
application to use those optimally; the driver just performs the
requested mapping.

Is it because the names of those hints indicate data lifetime? These are
just arbitrary numbers used by applications to separate placement. If
they were called HINT_A, HINT_B, HINT_C, would that make this ok?


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-10 14:52             ` Keith Busch
@ 2024-06-11  5:47               ` Christoph Hellwig
  2024-06-11 14:32                 ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2024-06-11  5:47 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Martin K. Petersen, Kanchan Joshi, axboe, sagi,
	linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi,
	Nitesh Shetty

On Mon, Jun 10, 2024 at 08:52:12AM -0600, Keith Busch wrote:
> I agree the FDP setup is complicated, but none of that is taken on by
> the driver. It just discovers the capabilities and maps an arbitrary
> software "hint" to an arbitrary device "hint". It's up to the
> application to use those optimally; the driver just performs the
> requested mapping.
> 
> Is it because the names of those hints indicate data lifetime? These are
> just arbitrary numbers used by applications to separate placement. If
> they were called HINT_A, HINT_B, HINT_C, would that make this ok?

No, the other problem is that FDP very much has an implicit contract
that the host actually aligns to it resources units, and actually
has a really complicated management.  It's not a simple throw a
lifetime hint at the drive.  Note that the implicit is indeed very
implicit - it is a really horrible spec with a lot of assumptions
but nothing actually enforcing it.  If you just use it for dumb
lifetime hints changes are that you actually increase write
amplificiation.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-11  5:47               ` Christoph Hellwig
@ 2024-06-11 14:32                 ` Keith Busch
  2024-06-11 19:43                   ` Martin K. Petersen
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2024-06-11 14:32 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Martin K. Petersen, Kanchan Joshi, axboe, sagi, linux-nvme,
	javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Tue, Jun 11, 2024 at 07:47:26AM +0200, Christoph Hellwig wrote:
> On Mon, Jun 10, 2024 at 08:52:12AM -0600, Keith Busch wrote:
> > I agree the FDP setup is complicated, but none of that is taken on by
> > the driver. It just discovers the capabilities and maps an arbitrary
> > software "hint" to an arbitrary device "hint". It's up to the
> > application to use those optimally; the driver just performs the
> > requested mapping.
> > 
> > Is it because the names of those hints indicate data lifetime? These are
> > just arbitrary numbers used by applications to separate placement. If
> > they were called HINT_A, HINT_B, HINT_C, would that make this ok?
> 
> No, the other problem is that FDP very much has an implicit contract
> that the host actually aligns to it resources units, and actually
> has a really complicated management.  It's not a simple throw a
> lifetime hint at the drive.  Note that the implicit is indeed very
> implicit - it is a really horrible spec with a lot of assumptions
> but nothing actually enforcing it.  If you just use it for dumb
> lifetime hints chances are that you actually increase write
> amplification.

NVMe has various features that recommend many things, but none of them
are enforced (see NOWS, NPDA, NPWA, etc...). We expect applications to
act in good faith, but nothing is enforced at the protocol level because
it is difficult to bring on useful software otherwise. You just won't
maximize benefits if you don't align, and FDP is no different.

The fact that setting up a device to use FDP is such a pain is a clear
indication of the user's intentions and responisibilities for using
it. Yeah, a degenerate application abusing FDP semantics is worse for
performance and device wear than doing nothing at all, but why should
anyone care about that?

One thing FDP got right was mandating the Endurance Log: the drive must
provide a feedback mechanism for the host to know if what they're doing
is helpful or harmful. If you're just blindly throwing random fcntl
hints, then you're not the target audience for the feature; you're
expected to iterate and tweak your usage.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-11 14:32                 ` Keith Busch
@ 2024-06-11 19:43                   ` Martin K. Petersen
  2024-06-11 22:42                     ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Martin K. Petersen @ 2024-06-11 19:43 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Martin K. Petersen, Kanchan Joshi, axboe, sagi,
	linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi,
	Nitesh Shetty


Hi Keith!

> One thing FDP got right was mandating the Endurance Log: the drive
> must provide a feedback mechanism for the host to know if what they're
> doing is helpful or harmful.

Good luck teaching firefox what to do with that information!

> If you're just blindly throwing random fcntl hints, then you're not
> the target audience for the feature; you're expected to iterate and
> tweak your usage.

And that's exactly my point. What the various attempts at data
management in the specs have in common is that they are unsuitable for a
general purpose operating system and its applications.

We can all come up with a restrictive model which works beautifully for
one particular application. No problem. But that's not what standards
are supposed to be about! We used to produce specifications which worked
for every type of application and device.

It was a beautiful thing when we went away from cylinders, heads, and
sectors as tools to do performance management on storage. An abstracted
model for managing blocks that has worked for everything from USB flash
drives, over spinning rust, to million dollar storage arrays. With one
protocol. For decades. And still going. Because the abstraction worked,
and it removed the burden of having to care about device implementation
artifacts from applications and operating systems alike.

We need a similar model for data management. Something which works well
enough on the device media management side but which transcends one
particular application or device implementation. I really don't believe
any of the currently defined data management schemes are timeless the
same way as LBAs have proven to be...

-- 
Martin K. Petersen	Oracle Linux Engineering


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] nvme: enable FDP support
  2024-06-11 19:43                   ` Martin K. Petersen
@ 2024-06-11 22:42                     ` Keith Busch
  0 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2024-06-11 22:42 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Christoph Hellwig, Kanchan Joshi, axboe, sagi, linux-nvme,
	javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Tue, Jun 11, 2024 at 03:43:11PM -0400, Martin K. Petersen wrote:
> 
> Hi Keith!
> 
> > One thing FDP got right was mandating the Endurance Log: the drive
> > must provide a feedback mechanism for the host to know if what they're
> > doing is helpful or harmful.
> 
> Good luck teaching firefox what to do with that information!
> 
> > If you're just blindly throwing random fcntl hints, then you're not
> > the target audience for the feature; you're expected to iterate and
> > tweak your usage.
> 
> And that's exactly my point. What the various attempts at data
> management in the specs have in common is that they are unsuitable for a
> general purpose operating system and its applications.
> 
> We can all come up with a restrictive model which works beautifully for
> one particular application. No problem. But that's not what standards
> are supposed to be about! We used to produce specifications which worked
> for every type of application and device.

FDP isn't a user distro level feature. This is expert level, and is not
reachable by default; you really need admin know-how to make a namespace
that recognizes these semantics. The machines that can reach *this*
feature are most certainly headless servers, so consumer use cases
aren't considered yet! All in good time (maybe).

And as I don asbestos underwear and dare say, Linux enables enterprise
storage capabilities for filesystems not reachable for the average user
(DAX), so we (Linux) are not exactly fencing off difficult to use
features. FDP doesn't even require new kernel interface changes to wire
it up to filesystems, so there's no additional maintenance burden here.
From a pure block and NVMe maintenance point of view, this is nothing.

I do have gripes with the *kernel* interfaces, though. Mainly that it's
per inode which makes this useless with raw block IO. We started FDP
with the passthrough interface, and that proved usage produces
meaningful gains, so hooking this into existing data separation provided
by fnctl feels like a natural progression despite its limitations.

I think enabling such experimentation can only help make these
interfaces become better and enlighten future protocol changes.
 
> It was a beautiful thing when we went away from cylinders, heads, and
> sectors as tools to do performance management on storage. An abstracted
> model for managing blocks that has worked for everything from USB flash
> drives, over spinning rust, to million dollar storage arrays. With one
> protocol. For decades. And still going. Because the abstraction worked,
> and it removed the burden of having to care about device implementation
> artifacts from applications and operating systems alike.
> 
> We need a similar model for data management. Something which works well
> enough on the device media management side but which transcends one
> particular application or device implementation. I really don't believe
> any of the currently defined data management schemes are timeless the
> same way as LBAs have proven to be...

No disagreement here!

NVMe 1.0 defined Read/Write CDW13 DSM field with what I think are almost
the desired semantics. 12 years later, no one implemented it, but mark
my words: we'll circle back to something similar in 12 more years.


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-06-11 22:42 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <CGME20240528151007epcas5p32583675f647553923e5ba4987e9bc6ed@epcas5p3.samsung.com>
2024-05-28 15:02 ` [PATCH v2] nvme: enable FDP support Kanchan Joshi
2024-06-07 15:14   ` Keith Busch
2024-06-08  5:17     ` Christoph Hellwig
2024-06-10 10:38       ` Kanchan Joshi
2024-06-10 11:27         ` Martin K. Petersen
2024-06-10 11:53           ` Javier González
2024-06-10 11:55           ` [PATCH v2] " Christoph Hellwig
2024-06-10 14:52             ` Keith Busch
2024-06-11  5:47               ` Christoph Hellwig
2024-06-11 14:32                 ` Keith Busch
2024-06-11 19:43                   ` Martin K. Petersen
2024-06-11 22:42                     ` Keith Busch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox