* [PATCH v2] nvme: enable FDP support
From: Kanchan Joshi
Date: 2024-05-28 15:02 UTC
To: axboe, kbusch, hch, sagi
Cc: linux-nvme, javier.gonz, bvanassche, gost.dev, Kanchan Joshi, Hui Qi, Nitesh Shetty

Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
to control the placement of logical blocks so as to reduce the SSD WAF.

Userspace can send data lifetime information using write hints. The
SCSI driver (sd) can already pass this information to SCSI devices.
This patch does the same for NVMe.

Fetch the placement identifiers (plids) if the device supports FDP, and
map the incoming write hints to plids.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Hui Qi <hui81.qi@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
Changes since v1:
- Reduce the fetched plids from 128 to 6 (Keith)
- Use struct_size for a calculation (Keith)
- Handle robot/sparse warning

 drivers/nvme/host/core.c | 67 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  4 +++
 include/linux/nvme.h     | 19 ++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 954f850f113a..9b67c3afe003 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -42,6 +42,20 @@ struct nvme_ns_info {
 	bool is_removed;
 };
 
+struct nvme_fdp_ruh_status_desc {
+	u16 pid;
+	u16 ruhid;
+	u32 earutr;
+	u64 ruamw;
+	u8  rsvd16[16];
+};
+
+struct nvme_fdp_ruh_status {
+	u8  rsvd0[14];
+	__le16 nruhsd;
+	struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
 unsigned int admin_timeout = 60;
 module_param(admin_timeout, uint, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@@ -922,6 +936,16 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 	return BLK_STS_OK;
 }
 
+static inline void nvme_assign_placement_id(struct nvme_ns *ns,
+					struct request *req,
+					struct nvme_command *cmd)
+{
+	enum rw_hint h = umin(ns->head->nr_plids - 1, req->write_hint);
+
+	cmd->rw.control |= cpu_to_le16(NVME_RW_DTYPE_DPLCMT);
+	cmd->rw.dsmgmt |= cpu_to_le32(ns->head->plids[h] << 16);
+}
+
 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 		struct request *req, struct nvme_command *cmnd,
 		enum nvme_opcode op)
@@ -1037,6 +1061,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
 		break;
 	case REQ_OP_WRITE:
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+		if (!ret && ns->head->nr_plids)
+			nvme_assign_placement_id(ns, req, cmd);
 		break;
 	case REQ_OP_ZONE_APPEND:
 		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
@@ -2049,6 +2075,40 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
 	return ret;
 }
 
+static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
+{
+	struct nvme_command c = {};
+	struct nvme_fdp_ruh_status *ruhs;
+	struct nvme_fdp_ruh_status_desc *ruhsd;
+	int size, ret, i;
+
+	size = struct_size(ruhs, ruhsd, NVME_MAX_PLIDS);
+	ruhs = kzalloc(size, GFP_KERNEL);
+	if (!ruhs)
+		return -ENOMEM;
+
+	c.imr.opcode = nvme_cmd_io_mgmt_recv;
+	c.imr.nsid = cpu_to_le32(nsid);
+	c.imr.mo = 0x1;
+	c.imr.numd = cpu_to_le32((size >> 2) - 1);
+
+	ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+	if (ret)
+		goto out;
+
+	ns->head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+	ns->head->nr_plids =
+		min_t(u16, ns->head->nr_plids, NVME_MAX_PLIDS);
+
+	for (i = 0; i < ns->head->nr_plids; i++) {
+		ruhsd = &ruhs->ruhsd[i];
+		ns->head->plids[i] = le16_to_cpu(ruhsd->pid);
+	}
+out:
+	kfree(ruhs);
+	return ret;
+}
+
 static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		struct nvme_ns_info *info)
 {
@@ -2136,6 +2196,13 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		if (ret && !nvme_first_scan(ns->disk))
 			goto out;
 	}
+	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+		ret = nvme_fetch_fdp_plids(ns, info->nsid);
+		if (ret)
+			dev_warn(ns->ctrl->device,
+				"FDP failure status:0x%x\n", ret);
+	}
+
 	ret = 0;
 out:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cacc56f4bbf4..bec3024d6af9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -445,6 +445,8 @@ struct nvme_ns_ids {
 	u8	csi;
 };
 
+#define NVME_MAX_PLIDS (WRITE_LIFE_EXTREME + 1)
+
 /*
  * Anchor structure for namespaces. There is one for each namespace in a
  * NVMe subsystem that any of our controllers can see, and the namespace
@@ -462,6 +464,8 @@ struct nvme_ns_head {
 	bool			shared;
 	bool			passthru_err_log_enabled;
 	int			instance;
+	u16			nr_plids;
+	u16			plids[NVME_MAX_PLIDS];
 	struct nvme_effects_log	*effects;
 	u64			nuse;
 	unsigned		ns_id;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 425573202295..fc07ba1b5ec5 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -270,6 +270,7 @@ enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
 	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 	NVME_CTRL_ATTR_ELBAS		= (1 << 15),
+	NVME_CTRL_ATTR_FDPS		= (1 << 19),
 };
 
 struct nvme_id_ctrl {
@@ -829,6 +830,7 @@ enum nvme_opcode {
 	nvme_cmd_resv_register	= 0x0d,
 	nvme_cmd_resv_report	= 0x0e,
 	nvme_cmd_resv_acquire	= 0x11,
+	nvme_cmd_io_mgmt_recv	= 0x12,
 	nvme_cmd_resv_release	= 0x15,
 	nvme_cmd_zone_mgmt_send	= 0x79,
 	nvme_cmd_zone_mgmt_recv	= 0x7a,
@@ -850,6 +852,7 @@ enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_register),	\
 		nvme_opcode_name(nvme_cmd_resv_report),		\
 		nvme_opcode_name(nvme_cmd_resv_acquire),	\
+		nvme_opcode_name(nvme_cmd_io_mgmt_recv),	\
 		nvme_opcode_name(nvme_cmd_resv_release),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\
@@ -1001,6 +1004,7 @@ enum {
 	NVME_RW_PRINFO_PRCHK_GUARD	= 1 << 12,
 	NVME_RW_PRINFO_PRACT		= 1 << 13,
 	NVME_RW_DTYPE_STREAMS		= 1 << 4,
+	NVME_RW_DTYPE_DPLCMT		= 2 << 4,
 	NVME_WZ_DEAC			= 1 << 9,
 };
 
@@ -1088,6 +1092,20 @@ struct nvme_zone_mgmt_recv_cmd {
 	__le32			cdw14[2];
 };
 
+struct nvme_io_mgmt_recv_cmd {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__le64			rsvd2[2];
+	union nvme_data_ptr	dptr;
+	__u8			mo;
+	__u8			rsvd11;
+	__u16			mos;
+	__le32			numd;
+	__le32			cdw12[4];
+};
+
 enum {
 	NVME_ZRA_ZONE_REPORT		= 0,
 	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
@@ -1808,6 +1826,7 @@ struct nvme_command {
 		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
+		struct nvme_io_mgmt_recv_cmd imr;
 	};
 };

-- 
2.25.1
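For context on the write hints the commit message refers to: a minimal userspace sketch of attaching a lifetime hint to a file with the existing fcntl interface. The kernel carries the hint as req->write_hint, which the patch above maps to an FDP placement identifier. The file path and hint value are arbitrary illustrations; F_SET_RW_HINT and the RWH_WRITE_LIFE_* constants are the existing uapi, though older libcs may need <linux/fcntl.h> rather than _GNU_SOURCE plus <fcntl.h>.

```c
/*
 * Hedged illustration (not part of the patch): tag a file with a
 * lifetime hint so subsequent writes carry req->write_hint.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;	/* short-lived data */
	int fd = open("/mnt/test/journal.log", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;

	/* every subsequent write on this inode inherits the hint */
	if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
		perror("F_SET_RW_HINT");

	if (write(fd, "log entry\n", 10) != 10)
		perror("write");

	close(fd);
	return 0;
}
```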
* Re: [PATCH v2] nvme: enable FDP support
From: Keith Busch
Date: 2024-06-07 15:14 UTC
To: Kanchan Joshi
Cc: axboe, hch, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Tue, May 28, 2024 at 08:32:33PM +0530, Kanchan Joshi wrote:
> Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
> to control the placement of logical blocks so as to reduce the SSD WAF.
>
> Userspace can send data lifetime information using write hints. The
> SCSI driver (sd) can already pass this information to SCSI devices.
> This patch does the same for NVMe.
>
> Fetch the placement identifiers (plids) if the device supports FDP, and
> map the incoming write hints to plids.

Looks good to me. I'll queue this up for 6.11 once the block 6.11 tree
is created and we can rebase on it.
* Re: [PATCH v2] nvme: enable FDP support
From: Christoph Hellwig
Date: 2024-06-08 5:17 UTC
To: Keith Busch
Cc: Kanchan Joshi, axboe, hch, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

Nacked-by: Christoph Hellwig <hch@lst.de>

This is a really overcomplicated way that does not in any way fit the
FDP use case (which is also rather overcomplicated).

If you want to pass lifetime information on to an NVMe device, please
work with the NVMe technical working group to add the equivalent of the
lifetime hints added to SCSI as part of the constrained streams in SBC.
It would be implementable in the Linux nvme driver in a few lines of
code, similarly trivially in the device, and it actually makes things
work.
* Re: [PATCH v2] nvme: enable FDP support
From: Kanchan Joshi
Date: 2024-06-10 10:38 UTC
To: Christoph Hellwig, Keith Busch
Cc: axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On 6/8/2024 10:47 AM, Christoph Hellwig wrote:
> Nacked-by: Christoph Hellwig <hch@lst.de>
>
> This is a really overcomplicated way that does not in any way fit the
> FDP use case (which is also rather overcomplicated).
>
> If you want to pass lifetime information on to an NVMe device, please
> work with the NVMe technical working group to add the equivalent of
> the lifetime hints added to SCSI as part of the constrained streams in
> SBC. It would be implementable in the Linux nvme driver in a few lines
> of code, similarly trivially in the device, and it actually makes
> things work.

I disagree. Bart's patch using "constrained streams in SBC" adds 104
lines [*]. This patch adds 90 lines. Both do the same two conceptual
steps, and do them equally trivially:

Step 1: query the protocol-specific hints.
- sd_read_io_hints (scsi): sends one or two commands to the device
- nvme_fetch_fdp_plids (nvme): sends one command to the device

Step 2: map write hints to the protocol-specific hints.
- sd_group_number (scsi)
- nvme_assign_placement_id (nvme)

So the current plumbing is nearly identical and as simple as SCSI.

And the TP 4146 author list shows 10 companies. Perhaps good enough for
diverse opinions on how to go about things before settling down.

We have used the passthrough interface for FDP wherever possible. But
there are users asking for a file/block interface to what has been
standardized. We end up sharing this patch as an off-tree solution
(which is a pity). This was also discussed at LSFMM, and there was no
push back on moving forward.

[*] https://lore.kernel.org/linux-scsi/20240222214508.1630719-4-bvanassche@acm.org/
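To make the passthrough route mentioned above concrete, here is a hedged sketch that issues the same I/O Management Receive command (opcode 0x12, management operation 0x1) that nvme_fetch_fdp_plids() sends in-kernel, but from userspace via the NVMe ioctl passthrough. The device path, namespace id, and buffer size are illustrative assumptions; the cdw10/cdw11 encoding and the byte offsets follow the nvme_io_mgmt_recv_cmd and nvme_fdp_ruh_status layouts from the patch.

```c
/*
 * Sketch: fetch reclaim unit handle status (placement identifiers)
 * through the NVMe I/O passthrough ioctl.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	unsigned int len = 4096;		/* room for many RUH status descriptors */
	unsigned char *buf = calloc(1, len);
	struct nvme_passthru_cmd cmd = {
		.opcode   = 0x12,		/* nvme_cmd_io_mgmt_recv */
		.nsid     = 1,			/* illustrative namespace id */
		.addr     = (unsigned long)buf,
		.data_len = len,
		.cdw10    = 0x1,		/* MO: reclaim unit handle status */
		.cdw11    = (len >> 2) - 1,	/* NUMD, 0's based dword count */
	};
	int fd = open("/dev/nvme0n1", O_RDONLY);

	if (fd < 0 || !buf)
		return 1;

	if (ioctl(fd, NVME_IOCTL_IO_CMD, &cmd)) {
		perror("io_mgmt_recv");
	} else {
		/* nruhsd at byte 14, first descriptor's pid at byte 16 (little-endian) */
		unsigned int nruhsd = buf[14] | (buf[15] << 8);
		unsigned int pid0 = buf[16] | (buf[17] << 8);

		printf("descriptors: %u, first plid: %u\n", nruhsd, pid0);
	}

	close(fd);
	free(buf);
	return 0;
}
```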
* Re: [PATCH v2] nvme: enable FDP support
From: Martin K. Petersen
Date: 2024-06-10 11:27 UTC
To: Kanchan Joshi
Cc: Christoph Hellwig, Keith Busch, axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

Hi Kanchan!

> So the current plumbing is nearly identical and as simple as SCSI.

I don't have a problem with your implementation, however...

> And the TP 4146 author list shows 10 companies. Perhaps good enough
> for diverse opinions on how to go about things before settling down.

I think FDP and SCSI streams are less than ideal. 10+ years and the
standards bodies still haven't been able to produce an approach that
makes sense in the context of a general purpose operating system.

-- 
Martin K. Petersen	Oracle Linux Engineering
* Re: nvme: enable FDP support
From: Javier González
Date: 2024-06-10 11:53 UTC
To: Martin K. Petersen
Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch, axboe, sagi, linux-nvme, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On 10.06.2024 07:27, Martin K. Petersen wrote:
>
> Hi Kanchan!
>
>> So the current plumbing is nearly identical and as simple as SCSI.
>
> I don't have a problem with your implementation, however...
>
>> And the TP 4146 author list shows 10 companies. Perhaps good enough
>> for diverse opinions on how to go about things before settling down.
>
> I think FDP and SCSI streams are less than ideal. 10+ years and the
> standards bodies still haven't been able to produce an approach that
> makes sense in the context of a general purpose operating system.

I want to believe that with each iteration we are getting closer to
something that can actually be deployed without major OS and
application changes.
* Re: [PATCH v2] nvme: enable FDP support
From: Christoph Hellwig
Date: 2024-06-10 11:55 UTC
To: Martin K. Petersen
Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch, axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Mon, Jun 10, 2024 at 07:27:16AM -0400, Martin K. Petersen wrote:
>> And the TP 4146 author list shows 10 companies. Perhaps good enough
>> for diverse opinions on how to go about things before settling down.
>
> I think FDP and SCSI streams are less than ideal. 10+ years and the
> standards bodies still haven't been able to produce an approach that
> makes sense in the context of a general purpose operating system.

The SCSI temperature hints are, I think, exactly what we want. The fact
that they had to be shoe-horned in in a weird way to claim to be
streams is a little awkward, but sometimes that's needed to get things
done.

Hint to Samsung and Meta: if you have to hack them into NVMe while
still claiming it's FDP so that your marketing budget isn't lost, we
can probably live with that.
* Re: [PATCH v2] nvme: enable FDP support
From: Keith Busch
Date: 2024-06-10 14:52 UTC
To: Christoph Hellwig
Cc: Martin K. Petersen, Kanchan Joshi, axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Mon, Jun 10, 2024 at 01:55:41PM +0200, Christoph Hellwig wrote:
> On Mon, Jun 10, 2024 at 07:27:16AM -0400, Martin K. Petersen wrote:
>>> And the TP 4146 author list shows 10 companies. Perhaps good enough
>>> for diverse opinions on how to go about things before settling down.
>>
>> I think FDP and SCSI streams are less than ideal. 10+ years and the
>> standards bodies still haven't been able to produce an approach that
>> makes sense in the context of a general purpose operating system.
>
> The SCSI temperature hints are, I think, exactly what we want. The
> fact that they had to be shoe-horned in in a weird way to claim to be
> streams is a little awkward, but sometimes that's needed to get things
> done.

I feel a bit out of the loop here; I'm not sure what the concern is.

I agree the FDP setup is complicated, but none of that is taken on by
the driver. It just discovers the capabilities and maps an arbitrary
software "hint" to an arbitrary device "hint". It's up to the
application to use those optimally; the driver just performs the
requested mapping.

Is it because the names of those hints indicate data lifetime? These
are just arbitrary numbers used by applications to separate placement.
If they were called HINT_A, HINT_B, HINT_C, would that make this ok?
* Re: [PATCH v2] nvme: enable FDP support
From: Christoph Hellwig
Date: 2024-06-11 5:47 UTC
To: Keith Busch
Cc: Christoph Hellwig, Martin K. Petersen, Kanchan Joshi, axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Mon, Jun 10, 2024 at 08:52:12AM -0600, Keith Busch wrote:
> I agree the FDP setup is complicated, but none of that is taken on by
> the driver. It just discovers the capabilities and maps an arbitrary
> software "hint" to an arbitrary device "hint". It's up to the
> application to use those optimally; the driver just performs the
> requested mapping.
>
> Is it because the names of those hints indicate data lifetime? These
> are just arbitrary numbers used by applications to separate placement.
> If they were called HINT_A, HINT_B, HINT_C, would that make this ok?

No, the other problem is that FDP very much has an implicit contract
that the host actually aligns to its resource units, and it requires
really complicated management. It's not a simple matter of throwing a
lifetime hint at the drive. Note that the implicit contract is indeed
very implicit - it is a really horrible spec with a lot of assumptions
but nothing actually enforcing it. If you just use it for dumb lifetime
hints, chances are that you actually increase write amplification.
* Re: [PATCH v2] nvme: enable FDP support
From: Keith Busch
Date: 2024-06-11 14:32 UTC
To: Christoph Hellwig
Cc: Martin K. Petersen, Kanchan Joshi, axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Tue, Jun 11, 2024 at 07:47:26AM +0200, Christoph Hellwig wrote:
> On Mon, Jun 10, 2024 at 08:52:12AM -0600, Keith Busch wrote:
>> I agree the FDP setup is complicated, but none of that is taken on by
>> the driver. It just discovers the capabilities and maps an arbitrary
>> software "hint" to an arbitrary device "hint". It's up to the
>> application to use those optimally; the driver just performs the
>> requested mapping.
>>
>> Is it because the names of those hints indicate data lifetime? These
>> are just arbitrary numbers used by applications to separate
>> placement. If they were called HINT_A, HINT_B, HINT_C, would that
>> make this ok?
>
> No, the other problem is that FDP very much has an implicit contract
> that the host actually aligns to its resource units, and it requires
> really complicated management. It's not a simple matter of throwing a
> lifetime hint at the drive. Note that the implicit contract is indeed
> very implicit - it is a really horrible spec with a lot of assumptions
> but nothing actually enforcing it. If you just use it for dumb
> lifetime hints, chances are that you actually increase write
> amplification.

NVMe has various features that recommend many things, but none of them
are enforced (see NOWS, NPDA, NPWA, etc...). We expect applications to
act in good faith, but nothing is enforced at the protocol level
because it is difficult to bring up useful software otherwise. You just
won't maximize the benefits if you don't align, and FDP is no
different. The fact that setting up a device to use FDP is such a pain
is a clear indication of the user's intentions and responsibilities for
using it.

Yeah, a degenerate application abusing FDP semantics is worse for
performance and device wear than doing nothing at all, but why should
anyone care about that?

One thing FDP got right was mandating the endurance log: the drive must
provide a feedback mechanism for the host to know whether what it is
doing is helpful or harmful. If you're just blindly throwing random
fcntl hints, then you're not the target audience for the feature;
you're expected to iterate and tweak your usage.
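A sketch of the feedback loop described above, under clearly stated assumptions: read what TP 4146 calls the FDP Statistics log page through the admin passthrough ioctl and derive a rough write amplification figure. The log page identifier (assumed 0x22), the endurance group id placed in the log specific identifier field of cdw11, and the field offsets (host bytes written at byte 0, media bytes written at byte 16, each 128-bit, only the low 64 bits read here) are assumptions to verify against the spec; the device path is illustrative.

```c
/*
 * Sketch: poll the (assumed) FDP statistics log to gauge whether the
 * chosen hint placement actually reduces write amplification.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	unsigned char log[64] = { 0 };
	unsigned int numd = (sizeof(log) >> 2) - 1;	/* 0's based dword count */
	struct nvme_admin_cmd cmd = {
		.opcode   = 0x02,			/* Get Log Page */
		.nsid     = 0xffffffff,
		.addr     = (unsigned long)log,
		.data_len = sizeof(log),
		.cdw10    = 0x22 | ((numd & 0xffff) << 16),	/* assumed FDP stats LID */
		.cdw11    = (numd >> 16) | (1 << 16),	/* NUMDU; endurance group 1 (assumption) */
	};
	int fd = open("/dev/nvme0", O_RDONLY);

	if (fd < 0)
		return 1;

	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd)) {
		perror("get-log-page");
	} else {
		unsigned long long hbmw, mbmw;

		memcpy(&hbmw, &log[0], 8);	/* host bytes written, low 64 bits (assumed offset) */
		memcpy(&mbmw, &log[16], 8);	/* media bytes written, low 64 bits (assumed offset) */
		printf("WAF so far: %.2f\n", hbmw ? (double)mbmw / hbmw : 0.0);
	}

	close(fd);
	return 0;
}
```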
* Re: [PATCH v2] nvme: enable FDP support
From: Martin K. Petersen
Date: 2024-06-11 19:43 UTC
To: Keith Busch
Cc: Christoph Hellwig, Martin K. Petersen, Kanchan Joshi, axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

Hi Keith!

> One thing FDP got right was mandating the endurance log: the drive
> must provide a feedback mechanism for the host to know whether what it
> is doing is helpful or harmful.

Good luck teaching firefox what to do with that information!

> If you're just blindly throwing random fcntl hints, then you're not
> the target audience for the feature; you're expected to iterate and
> tweak your usage.

And that's exactly my point. What the various attempts at data
management in the specs have in common is that they are unsuitable for
a general purpose operating system and its applications.

We can all come up with a restrictive model which works beautifully for
one particular application. No problem. But that's not what standards
are supposed to be about! We used to produce specifications which
worked for every type of application and device.

It was a beautiful thing when we went away from cylinders, heads, and
sectors as tools to do performance management on storage. An abstracted
model for managing blocks that has worked for everything from USB flash
drives, over spinning rust, to million dollar storage arrays. With one
protocol. For decades. And still going. Because the abstraction worked,
and it removed the burden of having to care about device implementation
artifacts from applications and operating systems alike.

We need a similar model for data management. Something which works well
enough on the device media management side but which transcends one
particular application or device implementation. I really don't believe
any of the currently defined data management schemes are timeless the
same way as LBAs have proven to be...

-- 
Martin K. Petersen	Oracle Linux Engineering
* Re: [PATCH v2] nvme: enable FDP support
From: Keith Busch
Date: 2024-06-11 22:42 UTC
To: Martin K. Petersen
Cc: Christoph Hellwig, Kanchan Joshi, axboe, sagi, linux-nvme, javier.gonz, bvanassche, gost.dev, Hui Qi, Nitesh Shetty

On Tue, Jun 11, 2024 at 03:43:11PM -0400, Martin K. Petersen wrote:
> Hi Keith!
>
>> One thing FDP got right was mandating the endurance log: the drive
>> must provide a feedback mechanism for the host to know whether what
>> it is doing is helpful or harmful.
>
> Good luck teaching firefox what to do with that information!
>
>> If you're just blindly throwing random fcntl hints, then you're not
>> the target audience for the feature; you're expected to iterate and
>> tweak your usage.
>
> And that's exactly my point. What the various attempts at data
> management in the specs have in common is that they are unsuitable for
> a general purpose operating system and its applications.
>
> We can all come up with a restrictive model which works beautifully
> for one particular application. No problem. But that's not what
> standards are supposed to be about! We used to produce specifications
> which worked for every type of application and device.

FDP isn't a user distro level feature. This is expert level, and it is
not reachable by default; you really need admin know-how to create a
namespace that recognizes these semantics. The machines that can reach
*this* feature are most certainly headless servers, so consumer use
cases aren't considered yet! All in good time (maybe).

And as I don asbestos underwear and dare say it: Linux enables
enterprise storage capabilities for filesystems that are not reachable
by the average user (DAX), so we (Linux) are not exactly fencing off
difficult to use features.

FDP doesn't even require new kernel interface changes to wire it up to
filesystems, so there's no additional maintenance burden here. From a
pure block and NVMe maintenance point of view, this is nothing.

I do have gripes with the *kernel* interfaces, though. Mainly that the
hint is per inode, which makes this useless with raw block IO. We
started FDP with the passthrough interface, and that proved the usage
produces meaningful gains, so hooking this into the existing data
separation provided by fcntl feels like a natural progression despite
its limitations. I think enabling such experimentation can only help
make these interfaces better and inform future protocol changes.

> It was a beautiful thing when we went away from cylinders, heads, and
> sectors as tools to do performance management on storage. An
> abstracted model for managing blocks that has worked for everything
> from USB flash drives, over spinning rust, to million dollar storage
> arrays. With one protocol. For decades. And still going. Because the
> abstraction worked, and it removed the burden of having to care about
> device implementation artifacts from applications and operating
> systems alike.
>
> We need a similar model for data management. Something which works
> well enough on the device media management side but which transcends
> one particular application or device implementation. I really don't
> believe any of the currently defined data management schemes are
> timeless the same way as LBAs have proven to be...

No disagreement here! NVMe 1.0 defined a Read/Write CDW13 DSM field
with what I think are almost the desired semantics. 12 years later, no
one has implemented it, but mark my words: we'll circle back to
something similar in 12 more years.