Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] nvme: enable FDP support
       [not found] <CGME20240702103348epcas5p1015eafbddf4795558843cd74b0453b12@epcas5p1.samsung.com>
@ 2024-07-02 10:26 ` Kanchan Joshi
  2024-07-02 11:39   ` Christoph Hellwig
  0 siblings, 1 reply; 8+ messages in thread
From: Kanchan Joshi @ 2024-07-02 10:26 UTC (permalink / raw)
  To: axboe, kbusch, hch, martin.petersen, sagi
  Cc: linux-nvme, bvanassche, javier.gonz, gost.dev, Kanchan Joshi,
	Hui Qi, Nitesh Shetty

Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
to control the placement of logical blocks so as to reduce the SSD WAF.

Userspace can send the data lifetime information using the write hints.
The SCSI driver (sd) can already pass this information to the SCSI
devices. This patch does the same for NVMe.

Fetch the placement-identifiers if the device supports FDP.
The incoming write-hint is mapped to a placement-identifier, which in
turn is set in the DSPEC field of the write command.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Hui Qi <hui81.qi@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
Changes since v2:
- Base it on nvme-6.11 and resolve a merge conflict

Changes since v1:
- Reduce the fetched plids from 128 to 6 (Keith)
- Use struct_size for a calculation (Keith)
- Handle robot/sparse warning

 drivers/nvme/host/core.c | 67 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  4 +++
 include/linux/nvme.h     | 19 ++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 20e7505852ce..c7455e917e3b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -42,6 +42,20 @@ struct nvme_ns_info {
 	bool is_removed;
 };
 
+struct nvme_fdp_ruh_status_desc {	/* 32-byte entry, little-endian fields */
+	__le16 pid;	/* placement identifier; read via le16_to_cpu() */
+	__le16 ruhid;
+	__le32 earutr;
+	__le64 ruamw;
+	u8  rsvd16[16];
+};
+
+struct nvme_fdp_ruh_status {	/* buffer filled by the imr command, mo 0x1 */
+	u8  rsvd0[14];
+	__le16 nruhsd;	/* number of ruhsd[] entries reported by the device */
+	struct nvme_fdp_ruh_status_desc ruhsd[];	/* sized by the caller */
+};
+
 unsigned int admin_timeout = 60;
 module_param(admin_timeout, uint, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@@ -957,6 +971,16 @@ static bool nvme_valid_atomic_write(struct request *req)
 	return true;
 }
 
+static inline void nvme_assign_placement_id(struct nvme_ns *ns,
+					struct request *req,
+					struct nvme_command *cmd)
+{
+	enum rw_hint h = umin(ns->head->nr_plids - 1, req->write_hint);
+	/* h clamps to the last plid; the u32 cast avoids a signed int shift. */
+	cmd->rw.control |= cpu_to_le16(NVME_RW_DTYPE_DPLCMT);
+	cmd->rw.dsmgmt |= cpu_to_le32((u32)ns->head->plids[h] << 16);
+}
+
 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 		struct request *req, struct nvme_command *cmnd,
 		enum nvme_opcode op)
@@ -1075,6 +1099,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
 		break;
 	case REQ_OP_WRITE:
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+		if (!ret && ns->head->nr_plids)
+			nvme_assign_placement_id(ns, req, cmd);
 		break;
 	case REQ_OP_ZONE_APPEND:
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
@@ -2105,6 +2131,40 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
 	return ret;
 }
 
+static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
+{
+	struct nvme_command c = {};
+	struct nvme_fdp_ruh_status *ruhs;
+	size_t size = struct_size(ruhs, ruhsd, NVME_MAX_PLIDS);
+	int ret;
+	u16 nr, i;
+
+	/* Fetch up to NVME_MAX_PLIDS placement ids of @nsid into ns->head. */
+	/* Returns 0, -ENOMEM, or the nvme_submit_sync_cmd() result. */
+	ruhs = kzalloc(size, GFP_KERNEL);
+	if (!ruhs)
+		return -ENOMEM;
+
+	c.imr.opcode = nvme_cmd_io_mgmt_recv;
+	c.imr.nsid = cpu_to_le32(nsid);
+	c.imr.mo = 0x1;
+	/* numd is a zero-based count of dwords in the buffer. */
+	c.imr.numd = cpu_to_le32((size >> 2) - 1);
+
+	ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+	if (ret)
+		goto out;
+
+	/* Clamp first, fill plids[], then set the count the write path tests. */
+	nr = min_t(u16, le16_to_cpu(ruhs->nruhsd), NVME_MAX_PLIDS);
+	for (i = 0; i < nr; i++)
+		ns->head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
+	ns->head->nr_plids = nr;
+out:
+	kfree(ruhs);
+	return ret;
+}
+
 static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		struct nvme_ns_info *info)
 {
@@ -2196,6 +2256,13 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		if (ret && !nvme_first_scan(ns->disk))
 			goto out;
 	}
+	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+		ret = nvme_fetch_fdp_plids(ns, info->nsid);
+		if (ret)
+			dev_warn(ns->ctrl->device,
+				"FDP failure status:0x%x\n", ret);
+	}
+
 
 	ret = 0;
 out:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b512012b7044..7b88b8ae502e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -450,6 +450,8 @@ struct nvme_ns_ids {
 	u8	csi;
 };
 
+#define NVME_MAX_PLIDS   (WRITE_LIFE_EXTREME + 1)	/* capacity of plids[] */
+
 /*
  * Anchor structure for namespaces.  There is one for each namespace in a
  * NVMe subsystem that any of our controllers can see, and the namespace
@@ -466,6 +468,8 @@ struct nvme_ns_head {
 	struct kref		ref;
 	bool			shared;
 	bool			passthru_err_log_enabled;
+	u16			nr_plids;
+	u16			plids[NVME_MAX_PLIDS];
 	int			instance;
 	struct nvme_effects_log *effects;
 	u64			nuse;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 57e27e48c913..9effc5902901 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -273,6 +273,7 @@ enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
 	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 	NVME_CTRL_ATTR_ELBAS		= (1 << 15),
+	NVME_CTRL_ATTR_FDPS		= (1 << 19),
 };
 
 struct nvme_id_ctrl {
@@ -832,6 +833,7 @@ enum nvme_opcode {
 	nvme_cmd_resv_register	= 0x0d,
 	nvme_cmd_resv_report	= 0x0e,
 	nvme_cmd_resv_acquire	= 0x11,
+	nvme_cmd_io_mgmt_recv	= 0x12,
 	nvme_cmd_resv_release	= 0x15,
 	nvme_cmd_zone_mgmt_send	= 0x79,
 	nvme_cmd_zone_mgmt_recv	= 0x7a,
@@ -853,6 +855,7 @@ enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_register),	\
 		nvme_opcode_name(nvme_cmd_resv_report),		\
 		nvme_opcode_name(nvme_cmd_resv_acquire),	\
+		nvme_opcode_name(nvme_cmd_io_mgmt_recv),	\
 		nvme_opcode_name(nvme_cmd_resv_release),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\
@@ -1004,6 +1007,7 @@ enum {
 	NVME_RW_PRINFO_PRCHK_GUARD	= 1 << 12,
 	NVME_RW_PRINFO_PRACT		= 1 << 13,
 	NVME_RW_DTYPE_STREAMS		= 1 << 4,
+	NVME_RW_DTYPE_DPLCMT		= 2 << 4,
 	NVME_WZ_DEAC			= 1 << 9,
 };
 
@@ -1091,6 +1095,20 @@ struct nvme_zone_mgmt_recv_cmd {
 	__le32			cdw14[2];
 };
 
+struct nvme_io_mgmt_recv_cmd {
+	__u8			opcode;		/* nvme_cmd_io_mgmt_recv */
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__le64			rsvd2[2];
+	union nvme_data_ptr	dptr;
+	__u8			mo;		/* 0x1 in nvme_fetch_fdp_plids() */
+	__u8			rsvd11;
+	__u16			mos;
+	__le32			numd;		/* buffer dwords - 1 */
+	__le32			cdw12[4];
+};
+
 enum {
 	NVME_ZRA_ZONE_REPORT		= 0,
 	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
@@ -1811,6 +1829,7 @@ struct nvme_command {
 		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
+		struct nvme_io_mgmt_recv_cmd imr;
 	};
 };
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] nvme: enable FDP support
  2024-07-02 10:26 ` [PATCH v3] nvme: enable FDP support Kanchan Joshi
@ 2024-07-02 11:39   ` Christoph Hellwig
  2024-07-02 15:15     ` Keith Busch
  2024-07-03  8:40     ` Kanchan Joshi
  0 siblings, 2 replies; 8+ messages in thread
From: Christoph Hellwig @ 2024-07-02 11:39 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, kbusch, hch, martin.petersen, sagi, linux-nvme, bvanassche,
	javier.gonz, gost.dev, Hui Qi, Nitesh Shetty

Same NAK as before.  FDP was intentionally designed to not fit the
clearly documented Linux needs.  If you want support for Linux data
temperature hints work with the NVMe technical working group to make
it happen.



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] nvme: enable FDP support
  2024-07-02 11:39   ` Christoph Hellwig
@ 2024-07-02 15:15     ` Keith Busch
  2024-07-02 15:36       ` Christoph Hellwig
  2024-07-03  8:40     ` Kanchan Joshi
  1 sibling, 1 reply; 8+ messages in thread
From: Keith Busch @ 2024-07-02 15:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, axboe, martin.petersen, sagi, linux-nvme,
	bvanassche, javier.gonz, gost.dev, Hui Qi, Nitesh Shetty

I like consensus on features, but I don't think we're going to get that
here. It's my opinion that FDP is just arbitrary hints, and this is just
one way to map it to other hints. Not a perfect match, but I don't think
it needs to be. It is not realistic for protocols to target a specific
operating system, and Linux's write hints are not exactly a shining
example of an interface for this purpose either.

My first concern for applying something like this is what kind of
maintenance burdens does this create, or any potential harm to users who
don't subscribe to the feature? Nothing here looks alarming to me, and
there is a clear demand to be able to access these features like this.
Maybe it works out for some workloads, maybe it doesn't, but I don't see
a need to block this at this point.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] nvme: enable FDP support
  2024-07-02 15:15     ` Keith Busch
@ 2024-07-02 15:36       ` Christoph Hellwig
  2024-07-02 15:49         ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Christoph Hellwig @ 2024-07-02 15:36 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Kanchan Joshi, axboe, martin.petersen, sagi,
	linux-nvme, bvanassche, javier.gonz, gost.dev, Hui Qi,
	Nitesh Shetty

On Tue, Jul 02, 2024 at 09:15:06AM -0600, Keith Busch wrote:
> My first concern for applying something like this is what kind of
> maintenance burdens does this create, or any potential harm to users who
> don't subscribe to the feature? Nothing here looks alarming to me, and
> there is a clear demand to be able to access these features like this.
> Maybe it works out for some workloads, maybe it doesn't, but I don't see
> a need to block this at this point.

We're just abusing the interface, and given how badly designed and
intentionally Linux-hostile it was I see no point.

If Samsung and Meta care enough about good Linux I/O temperature hint
support we'll get what we want in the technical working group, please
help working on that!


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] nvme: enable FDP support
  2024-07-02 15:36       ` Christoph Hellwig
@ 2024-07-02 15:49         ` Jens Axboe
  2024-07-02 15:51           ` Christoph Hellwig
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2024-07-02 15:49 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch
  Cc: Kanchan Joshi, martin.petersen, sagi, linux-nvme, bvanassche,
	javier.gonz, gost.dev, Hui Qi, Nitesh Shetty

On 7/2/24 9:36 AM, Christoph Hellwig wrote:
> On Tue, Jul 02, 2024 at 09:15:06AM -0600, Keith Busch wrote:
>> My first concern for applying something like this is what kind of
>> maintenance burdens does this create, or any potential harm to users who
>> don't subscribe to the feature? Nothing here looks alarming to me, and
>> there is a clear demand to be able to access these features like this.
>> Maybe it works out for some workloads, maybe it doesn't, but I don't see
>> a need to block this at this point.
> 
> We're just abusing the interface, and giving how badly designed and
> intentionally Linux-hostile it was I see no point.
> 
> If Samsung and Meta care enough about good Linux I/O temperature hint
> support we'll get what we want in the technical working group, please
> help working on that!

I'm with Keith on this one - there's no real maintenance burden to bear
for this feature, it's pretty well contained. There's no point gate
keeping it based on changing the spec, as that will take a long time to
get done. And there's no reason to hold this back until this happens
based on mostly ideological reasons, when there are actual users that
could use it now. What do we have to lose?

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] nvme: enable FDP support
  2024-07-02 15:49         ` Jens Axboe
@ 2024-07-02 15:51           ` Christoph Hellwig
  0 siblings, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2024-07-02 15:51 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Keith Busch, Kanchan Joshi, martin.petersen,
	sagi, linux-nvme, bvanassche, javier.gonz, gost.dev, Hui Qi,
	Nitesh Shetty

On Tue, Jul 02, 2024 at 09:49:35AM -0600, Jens Axboe wrote:
> I'm with Keith on this one - there's no real maintenance burden to bear
> for this feature, it's pretty well contained. There's no point gate
> keeping it based on changing the spec, as that will take a long time to
> get done. And there's no reason to hold this back until this happens
> based on mostly ideological reasons, when there are actual users that
> could use it now. What do we have to lose?

What we do is a misuse of the spec intentionally designed to not
accommodate us.  I'm dead-set against this.  Please all help getting
the proper interface in.  I worked with folks on the SCSI side and
we finally got it (although with a few warts).

> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] nvme: enable FDP support
  2024-07-02 11:39   ` Christoph Hellwig
  2024-07-02 15:15     ` Keith Busch
@ 2024-07-03  8:40     ` Kanchan Joshi
  2024-07-03 15:08       ` Christoph Hellwig
  1 sibling, 1 reply; 8+ messages in thread
From: Kanchan Joshi @ 2024-07-03  8:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, kbusch, martin.petersen, sagi, linux-nvme, bvanassche,
	javier.gonz, gost.dev, Hui Qi, Nitesh Shetty

On 7/2/2024 5:09 PM, Christoph Hellwig wrote:
> Same NAK as before.  FDP was intentionally designed to not fit the
> clearly documented Linux needs.

That was not the design goal of FDP. It is ludicrous for any storage 
technology to have that as an intentional design choice.

> If you want support for Linux data
> temperature hints work with the NVMe technical working group to make
> it happen.

The value does not come from the temperature-sensitive names that we 
have in Linux. It rather comes from keeping the WAF low and from using 
Linux file-systems. The write-hint interface, despite not being ideal, 
happens to be the only interface we got for the latter.

If/when anything different grows in the device-side, we can trivially 
change the plumbing this patch does.

People need support for what we have in current spec and more 
importantly in products. It's not a Meta/Samsung only thing. Many 
hyperscalers and enterprise customers need this.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] nvme: enable FDP support
  2024-07-03  8:40     ` Kanchan Joshi
@ 2024-07-03 15:08       ` Christoph Hellwig
  0 siblings, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2024-07-03 15:08 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, axboe, kbusch, martin.petersen, sagi,
	linux-nvme, bvanassche, javier.gonz, gost.dev, Hui Qi,
	Nitesh Shetty

On Wed, Jul 03, 2024 at 02:10:24PM +0530, Kanchan Joshi wrote:
> That was not the design goal of FDP. It is ludicrous for any storage 
> technology to have that as an intentional design choice.

I think you missed out on the development dramas.  The Meta
representative for what became FDP absolutely rejected any proposal to
make it useful just for that reason, and anyone outside a small
circle got explicitly excluded from the development for just this
reason.  This is probably as much as I can say without getting in
trouble here, but I'm happy to provide more details in private.

> People need support for what we have in current spec and more 
> importantly in products. It's not a Meta/Samsung only thing. Many 
> hypsercalers and enterprise customers need this.

Then let's design something proper.  It's not that hard.  No one
needs to support anything, and between all the major design issues
in FDP and all the political crap thrown by Meta and Samsung at the
interests of Linux there's no way I can support this.


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-07-03 15:08 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <CGME20240702103348epcas5p1015eafbddf4795558843cd74b0453b12@epcas5p1.samsung.com>
2024-07-02 10:26 ` [PATCH v3] nvme: enable FDP support Kanchan Joshi
2024-07-02 11:39   ` Christoph Hellwig
2024-07-02 15:15     ` Keith Busch
2024-07-02 15:36       ` Christoph Hellwig
2024-07-02 15:49         ` Jens Axboe
2024-07-02 15:51           ` Christoph Hellwig
2024-07-03  8:40     ` Kanchan Joshi
2024-07-03 15:08       ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox