* [PATCH 0/2] Unprivileged sgl-only passthrough
       [not found] <CGME20231018183620epcas5p26ab74bdd1f2739ef3ec1ee2431329dc4@epcas5p2.samsung.com>
@ 2023-10-18 18:30 ` Kanchan Joshi
  2023-10-18 18:30   ` [PATCH 1/2] nvme-pci: meta-transfer via sgl Kanchan Joshi
                     ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Kanchan Joshi @ 2023-10-18 18:30 UTC (permalink / raw)
  To: hch, kbusch, axboe, sagi; +Cc: linux-nvme, gost.dev, joshiiitr, Kanchan Joshi

Patch 1: Prep. Adds the meta-transfer ability in nvme-pci.
Patch 2: Enables fine-granular passthrough, with the change that I/O
commands can transfer data only via SGL.

Requirements:
- Prepared against the block 6.6 tree.
- The patch for uring-passthrough failure handling is required to see
  the submission failure (if any):
  https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/

Kanchan Joshi (2):
  nvme-pci: meta-transfer via sgl
  nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands

 drivers/nvme/host/ioctl.c | 162 ++++++++++++++++++++++++++++++--------
 drivers/nvme/host/nvme.h  |   6 ++
 drivers/nvme/host/pci.c   |  63 +++++++++++++--
 include/linux/nvme.h      |   2 +
 4 files changed, 194 insertions(+), 39 deletions(-)

-- 
2.25.1

^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH 1/2] nvme-pci: meta-transfer via sgl 2023-10-18 18:30 ` [PATCH 0/2] Unprivileged sgl-only passthrough Kanchan Joshi @ 2023-10-18 18:30 ` Kanchan Joshi 2023-10-19 5:48 ` Christoph Hellwig 2023-10-18 18:30 ` [PATCH 2/2] nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands Kanchan Joshi 2023-10-18 18:40 ` [PATCH 0/2] Unprivileged sgl-only passthrough Jens Axboe 2 siblings, 1 reply; 18+ messages in thread From: Kanchan Joshi @ 2023-10-18 18:30 UTC (permalink / raw) To: hch, kbusch, axboe, sagi; +Cc: linux-nvme, gost.dev, joshiiitr, Kanchan Joshi Introduce the ability to transfer the metadata buffer using sgl. Also add a nvme request flag 'NVME_REQ_FORCE_SGL' that mandates both data and meta transfer via sgl. This is a prepatory patch to enable unprivileged passthrough via SGL. Suggested-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com> --- drivers/nvme/host/nvme.h | 6 ++++ drivers/nvme/host/pci.c | 63 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f35647c470af..58f8efe1ace9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -184,6 +184,7 @@ enum { NVME_REQ_CANCELLED = (1 << 0), NVME_REQ_USERCMD = (1 << 1), NVME_MPATH_IO_STATS = (1 << 2), + NVME_REQ_FORCE_SGL = (1 << 3), }; static inline struct nvme_request *nvme_req(struct request *req) @@ -1043,6 +1044,11 @@ static inline void nvme_start_request(struct request *rq) blk_mq_start_request(rq); } +static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl) +{ + return ctrl->sgls & (1 << 19); +} + static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) { return ctrl->sgls & ((1 << 0) | (1 << 1)); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3f0c9ee09a12..1907b1c9919a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -123,6 +123,7 @@ struct nvme_dev { struct device *dev; 
struct dma_pool *prp_page_pool; struct dma_pool *prp_small_pool; + struct dma_pool *meta_sgl_pool; unsigned online_queues; unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; @@ -236,6 +237,8 @@ struct nvme_iod { unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t first_dma; dma_addr_t meta_dma; + dma_addr_t meta_dma_sg; + struct nvme_sgl_desc *meta_sgl; struct sg_table sgt; union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; }; @@ -772,18 +775,23 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret = BLK_STS_RESOURCE; int rc; + bool force_sgl = nvme_req(req)->flags & NVME_REQ_FORCE_SGL; + + if (force_sgl && !nvme_ctrl_sgl_supported(&dev->ctrl)) + return BLK_STS_IOERR; if (blk_rq_nr_phys_segments(req) == 1) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) + if (!force_sgl && + bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); - if (nvmeq->qid && sgl_threshold && - nvme_ctrl_sgl_supported(&dev->ctrl)) + if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl) + && (sgl_threshold || force_sgl)) return nvme_setup_sgl_simple(dev, req, &cmnd->rw, &bv); } @@ -806,7 +814,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_free_sg; } - if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) + if (force_sgl || nvme_pci_use_sgls(dev, req, iod->sgt.nents)) ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); else ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); @@ -825,13 +833,44 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + bool force_sgl = nvme_req(req)->flags & NVME_REQ_FORCE_SGL; + blk_status_t ret; iod->meta_dma = 
dma_map_bvec(dev->dev, rq_integrity_vec(req), rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->meta_dma)) return BLK_STS_IOERR; - cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); + + if (!force_sgl) { + cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); + return BLK_STS_OK; + } + + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) { + WARN_ONCE(1, "controller does not support meta sgl."); + ret = BLK_STS_IOERR; + goto out_unmap; + } + + iod->meta_sgl = dma_pool_alloc(dev->meta_sgl_pool, GFP_KERNEL, + &iod->meta_dma_sg); + if (!iod->meta_sgl) { + ret = BLK_STS_IOERR; + goto out_unmap; + } + + iod->meta_sgl->addr = cpu_to_le64(iod->meta_dma); + iod->meta_sgl->length = cpu_to_le32(rq_integrity_vec(req)->bv_len); + iod->meta_sgl->type = NVME_SGL_FMT_DATA_DESC << 4; + cmnd->rw.metadata = cpu_to_le64(iod->meta_dma_sg); + cmnd->rw.flags = NVME_CMD_SGL_METASEG; + return BLK_STS_OK; + +out_unmap: + dma_unmap_page(dev->dev, iod->meta_dma, + rq_integrity_vec(req)->bv_len, rq_dma_dir(req)); + return ret; } static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) @@ -968,6 +1007,11 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req) dma_unmap_page(dev->dev, iod->meta_dma, rq_integrity_vec(req)->bv_len, rq_dma_dir(req)); + + if (nvme_req(req)->flags & NVME_REQ_FORCE_SGL) + dma_pool_free(dev->meta_sgl_pool, + (void *)iod->meta_sgl, + iod->meta_dma_sg); } if (blk_rq_nr_phys_segments(req)) @@ -2644,6 +2688,14 @@ static int nvme_setup_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_page_pool); return -ENOMEM; } + /* for metadata sgl */ + dev->meta_sgl_pool = dma_pool_create("meta sg 16", dev->dev, 16, 16, 0); + if (!dev->meta_sgl_pool) { + dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); + return -ENOMEM; + } + return 0; } @@ -2651,6 +2703,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) { dma_pool_destroy(dev->prp_page_pool); dma_pool_destroy(dev->prp_small_pool); + 
dma_pool_destroy(dev->meta_sgl_pool); } static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) -- 2.25.1 ^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] nvme-pci: meta-transfer via sgl
  2023-10-18 18:30 ` [PATCH 1/2] nvme-pci: meta-transfer via sgl Kanchan Joshi
@ 2023-10-19  5:48   ` Christoph Hellwig
  2023-10-19  9:54     ` Kanchan Joshi
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2023-10-19 5:48 UTC (permalink / raw)
  To: Kanchan Joshi; +Cc: hch, kbusch, axboe, sagi, linux-nvme, gost.dev, joshiiitr

On Thu, Oct 19, 2023 at 12:00:02AM +0530, Kanchan Joshi wrote:
> Introduce the ability to transfer the metadata buffer using sgl.
> Also add a nvme request flag 'NVME_REQ_FORCE_SGL' that mandates both
> data and meta transfer via sgl.

That's really two different changes; they should be separate patches.

^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] nvme-pci: meta-transfer via sgl
  2023-10-19  5:48 ` Christoph Hellwig
@ 2023-10-19  9:54   ` Kanchan Joshi
  2023-10-20  4:40     ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Kanchan Joshi @ 2023-10-19 9:54 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kbusch, axboe, sagi, linux-nvme, gost.dev, joshiiitr

On 10/19/2023 11:18 AM, Christoph Hellwig wrote:
> On Thu, Oct 19, 2023 at 12:00:02AM +0530, Kanchan Joshi wrote:
>> Introduce the ability to transfer the metadata buffer using sgl.
>> Also add a nvme request flag 'NVME_REQ_FORCE_SGL' that mandates both
>> data and meta transfer via sgl.
>
> That's really two different changes, that should be separate patches.

Do you mean a separate patch with just this flag?

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f35647c470af..58f8efe1ace9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -184,6 +184,7 @@ enum {
 	NVME_REQ_CANCELLED	= (1 << 0),
 	NVME_REQ_USERCMD	= (1 << 1),
 	NVME_MPATH_IO_STATS	= (1 << 2),
+	NVME_REQ_FORCE_SGL	= (1 << 3),
 };

I thought about it, but did not feel strongly about it, as all the new
code is really tied to this flag. I think the issue is more about the
commit message. I can just reword the commit description to make the
connection explicit. Something like this:

Add a nvme request flag 'NVME_REQ_FORCE_SGL' that mandates both data
and meta transfer via sgl. Meta-transfer via SGL is a new ability that
kicks in only when this flag is present.

^ permalink raw reply related	[flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] nvme-pci: meta-transfer via sgl
  2023-10-19  9:54 ` Kanchan Joshi
@ 2023-10-20  4:40   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2023-10-20 4:40 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, kbusch, axboe, sagi, linux-nvme, gost.dev, joshiiitr

On Thu, Oct 19, 2023 at 03:24:12PM +0530, Kanchan Joshi wrote:
> Do you mean a separate patch with just this flag?

Most importantly, split out the metadata SGL support into its own patch,
as it is a feature on its own. After that, a patch just for the flag is
indeed a bit silly, so maybe merge it into the next one.

^ permalink raw reply	[flat|nested] 18+ messages in thread
* [PATCH 2/2] nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands 2023-10-18 18:30 ` [PATCH 0/2] Unprivileged sgl-only passthrough Kanchan Joshi 2023-10-18 18:30 ` [PATCH 1/2] nvme-pci: meta-transfer via sgl Kanchan Joshi @ 2023-10-18 18:30 ` Kanchan Joshi 2023-10-19 5:49 ` Christoph Hellwig 2023-10-18 18:40 ` [PATCH 0/2] Unprivileged sgl-only passthrough Jens Axboe 2 siblings, 1 reply; 18+ messages in thread From: Kanchan Joshi @ 2023-10-18 18:30 UTC (permalink / raw) To: hch, kbusch, axboe, sagi; +Cc: linux-nvme, gost.dev, joshiiitr, Kanchan Joshi Passthrough commands are guarded by heavy-handed CAP_SYS_ADMIN checks that neglects the file-mode completely. Add a fine-granular policy that considers file-mode and various other parameters for any approval/denial. Also enable few admin commands that are necssary to get the information required to form the i/o commands. The policy is implemented in a new helper 'nvme_cmd_allowed'. I/O commands needing data-transfers are sent only via SGL. Signed-off-by: Kanchan Joshi <joshi.k@samsung.com> --- drivers/nvme/host/ioctl.c | 162 ++++++++++++++++++++++++++++++-------- include/linux/nvme.h | 2 + 2 files changed, 130 insertions(+), 34 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index eb2ef3e14961..3d0d91633946 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -10,8 +10,80 @@ enum { NVME_IOCTL_VEC = (1 << 0), + NVME_IOCTL_PARTITION = (1 << 1), }; +static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, + unsigned int flags, bool open_for_write) +{ + u32 effects; + + if (capable(CAP_SYS_ADMIN)) + return true; + + /* + * Do not allow unprivileged passthrough on partitions, as that allows an + * escape from the containment of the partition. + */ + if (flags & NVME_IOCTL_PARTITION) + return false; + + /* + * Do not allow unprivileged processes to send vendor specific or fabrics + * commands as we can't be sure about their effects. 
+ */ + if (c->common.opcode >= nvme_cmd_vendor_start || + c->common.opcode == nvme_fabrics_command) + return false; + + /* + * Do not allow unprivileged passthrough of admin commands except + * for a subset of identify commands that contain information required + * to form proper I/O commands in userspace and do not expose any + * potentially sensitive information. + */ + if (!ns) { + if (c->common.opcode == nvme_admin_identify) { + switch (c->identify.cns) { + case NVME_ID_CNS_NS: + case NVME_ID_CNS_CS_NS: + case NVME_ID_CNS_NS_CS_INDEP: + case NVME_ID_CNS_CS_CTRL: + case NVME_ID_CNS_CTRL: + return true; + } + } + return false; + } + + /* + * Check if the controller provides a Commands Supported and Effects log + * and marks this command as supported. If not reject unprivileged + * passthrough. + */ + effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); + if (!(effects & NVME_CMD_EFFECTS_CSUPP)) + return false; + + /* + * Don't allow passthrough for command that have intrusive (or unknown) + * effects. + */ + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_UUID_SEL | + NVME_CMD_EFFECTS_SCOPE_MASK)) + return false; + + /* + * Only allow I/O commands that transfer data to the controller or that + * change the logical block contents if the file descriptor is open for + * writing. 
+ */ + if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) + return open_for_write; + return true; +} + /* * Convert integer values from ioctl structures to user pointers, silently * ignoring the upper bits in the compat case to match behaviour of 32-bit @@ -134,6 +206,9 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, } *metap = meta; } + /* Data transfer for unprivileged passthrough is only via SGL */ + if (bdev && !capable(CAP_SYS_ADMIN)) + nvme_req(req)->flags |= NVME_REQ_FORCE_SGL; return ret; @@ -267,7 +342,8 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) + struct nvme_passthru_cmd __user *ucmd, unsigned int flags, + bool open_for_write) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -275,8 +351,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u64 result; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -297,6 +371,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, 0, open_for_write)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -313,16 +390,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd, - unsigned int flags) + struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags, + bool open_for_write) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -343,6 +418,9 @@ static int nvme_user_cmd64(struct 
nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, flags, open_for_write)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -492,9 +570,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, void *meta = NULL; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - c.common.opcode = READ_ONCE(cmd->opcode); c.common.flags = READ_ONCE(cmd->flags); if (c.common.flags) @@ -516,6 +591,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE)) + return -EACCES; + d.metadata = READ_ONCE(cmd->metadata); d.addr = READ_ONCE(cmd->addr); d.data_len = READ_ONCE(cmd->data_len); @@ -572,13 +650,13 @@ static bool is_ctrl_ioctl(unsigned int cmd) } static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, - void __user *argp) + void __user *argp, bool open_for_write) { switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, 0); + return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -603,16 +681,14 @@ struct nvme_user_io32 { #endif /* COMPAT_FOR_U64_ALIGNMENT */ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) + void __user *argp, unsigned int flags, bool open_for_write) { - unsigned int flags = 0; - switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); return ns->head->ns_id; case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, argp); + return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write); /* * struct nvme_user_io can have 
different padding on some 32-bit ABIs. * Just accept the compat version as all fields that are used are the @@ -627,32 +703,39 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, flags |= NVME_IOCTL_VEC; fallthrough; case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp, flags); + return nvme_user_cmd64(ns->ctrl, ns, argp, flags, + open_for_write); default: return -ENOTTY; } } -int nvme_ioctl(struct block_device *bdev, fmode_t mode, +int nvme_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; + bool open_for_write = mode & BLK_OPEN_WRITE; void __user *argp = (void __user *)arg; + unsigned int flags = 0; + + if (bdev_is_partition(bdev)) + flags |= NVME_IOCTL_PARTITION; if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, argp); - return nvme_ns_ioctl(ns, cmd, argp); + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); + return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); } long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + bool open_for_write = file->f_mode & FMODE_WRITE; void __user *argp = (void __user *)arg; if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, argp); - return nvme_ns_ioctl(ns, cmd, argp); + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); + return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); } static int nvme_uring_cmd_checks(unsigned int issue_flags) @@ -716,7 +799,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, } #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, struct nvme_ns_head *head, int srcu_idx) + void __user *argp, struct nvme_ns_head *head, int srcu_idx, + bool open_for_write) __releases(&head->srcu) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -724,7 +808,7 @@ 
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, nvme_get_ctrl(ns->ctrl); srcu_read_unlock(&head->srcu, srcu_idx); - ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); nvme_put_ctrl(ctrl); return ret; @@ -734,9 +818,14 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns_head *head = bdev->bd_disk->private_data; + bool open_for_write = mode & BLK_OPEN_WRITE; void __user *argp = (void __user *)arg; struct nvme_ns *ns; int srcu_idx, ret = -EWOULDBLOCK; + unsigned int flags = 0; + + if (bdev_is_partition(bdev)) + flags |= NVME_IOCTL_PARTITION; srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); @@ -749,9 +838,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, * deadlock when deleting namespaces using the passthrough interface. */ if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + open_for_write); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -760,6 +850,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + bool open_for_write = file->f_mode & FMODE_WRITE; struct cdev *cdev = file_inode(file)->i_cdev; struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); @@ -773,9 +864,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, goto out_unlock; if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + open_for_write); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); out_unlock: 
srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -824,7 +916,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) return ret; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, + bool open_for_write) { struct nvme_ns *ns; int ret; @@ -848,7 +941,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp); + ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); nvme_put_ns(ns); return ret; @@ -860,16 +953,17 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + bool open_for_write = file->f_mode & FMODE_WRITE; struct nvme_ctrl *ctrl = file->private_data; void __user *argp = (void __user *)arg; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, 0); + return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); + return nvme_dev_user_cmd(ctrl, argp, open_for_write); case NVME_IOCTL_RESET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index df3e2c619448..26dd3f859d9d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -642,6 +642,7 @@ enum { NVME_CMD_EFFECTS_CCC = 1 << 4, NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, + NVME_CMD_EFFECTS_SCOPE_MASK = GENMASK(31, 20), }; struct nvme_effects_log { @@ -833,6 +834,7 @@ enum nvme_opcode { nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, nvme_cmd_zone_append = 0x7d, + nvme_cmd_vendor_start = 0x80, }; #define nvme_opcode_name(opcode) { opcode, 
#opcode } -- 2.25.1 ^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 2/2] nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands
  2023-10-18 18:30 ` [PATCH 2/2] nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands Kanchan Joshi
@ 2023-10-19  5:49   ` Christoph Hellwig
  2023-10-19  9:59     ` Kanchan Joshi
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2023-10-19 5:49 UTC (permalink / raw)
  To: Kanchan Joshi; +Cc: hch, kbusch, axboe, sagi, linux-nvme, gost.dev, joshiiitr

On Thu, Oct 19, 2023 at 12:00:03AM +0530, Kanchan Joshi wrote:
> Passthrough commands are guarded by heavy-handed CAP_SYS_ADMIN checks
> that neglect the file mode completely.
>
> Add a fine-granular policy that considers file mode and various other
> parameters for any approval/denial. Also enable a few admin commands
> that are necessary to get the information required to form the i/o
> commands. The policy is implemented in a new helper 'nvme_cmd_allowed'.
>
> I/O commands needing data-transfers are sent only via SGL.

This looks mostly good to me, but we should not even send the command
if we know SGLs aren't supported.

^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH 2/2] nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands
  2023-10-19  5:49 ` Christoph Hellwig
@ 2023-10-19  9:59   ` Kanchan Joshi
  0 siblings, 0 replies; 18+ messages in thread
From: Kanchan Joshi @ 2023-10-19 9:59 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: kbusch, axboe, sagi, linux-nvme, gost.dev, joshiiitr

On 10/19/2023 11:19 AM, Christoph Hellwig wrote:
> On Thu, Oct 19, 2023 at 12:00:03AM +0530, Kanchan Joshi wrote:
>> Passthrough commands are guarded by heavy-handed CAP_SYS_ADMIN checks
>> that neglects the file-mode completely.
>>
>> Add a fine-granular policy that considers file-mode and various other
>> parameters for any approval/denial. Also enable few admin commands that
>> are necssary to get the information required to form the i/o commands.
>> The policy is implemented in a new helper 'nvme_cmd_allowed'.
>>
>> I/O commands needing data-transfers are sent only via SGL.
>
> This looks mostly good to me, but we should not even send the command
> if we know SGLs aren't supported.

I can do this here (and kill the checks from pcie):

@@ -134,6 +206,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		}
 		*metap = meta;
 	}
+	/* Data/Meta transfer for unprivileged passthrough is only via SGL */
+	if (bdev && !capable(CAP_SYS_ADMIN)) {
+		if (!nvme_ctrl_sgl_supported(ns->ctrl) ||
+		    (*metap && !nvme_ctrl_meta_sgl_supported(ns->ctrl))) {
+			ret = -EINVAL;
+			goto out_unmap;
+		}
+		nvme_req(req)->flags |= NVME_REQ_FORCE_SGL;
+	}

^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 18:30 ` [PATCH 0/2] Unprivileged sgl-only passthrough Kanchan Joshi
  2023-10-18 18:30   ` [PATCH 1/2] nvme-pci: meta-transfer via sgl Kanchan Joshi
  2023-10-18 18:30   ` [PATCH 2/2] nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands Kanchan Joshi
@ 2023-10-18 18:40   ` Jens Axboe
  2023-10-18 19:06     ` Kanchan Joshi
  2023-10-18 19:35     ` Keith Busch
  2 siblings, 2 replies; 18+ messages in thread
From: Jens Axboe @ 2023-10-18 18:40 UTC (permalink / raw)
  To: Kanchan Joshi, hch, kbusch, sagi; +Cc: linux-nvme, gost.dev, joshiiitr

On 10/18/23 12:30 PM, Kanchan Joshi wrote:
> Patch 1: Prep. Adds the meta-transfer ability in nvme-pci
> Patch 2: Enables fine-granular passthrough with the change that i/o
> commands can transfer the data only via SGL.
>
> Requirement:
> - Prepared against block 6.6 tree.
> - The patch in uring-passthrough failure handling is required to see the
>   submission failure (if any)
>   https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/

I didn't have time to follow the previous discussion, but what's the
reasoning behind allowing it for SGL only? IIRC, we do have an inline
vec for a small number of vecs, so presumably this would not hit
alloc+free for each IO? But even so, I would imagine that SGL is slower
than PRP? Do we know how much?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 18:40 ` [PATCH 0/2] Unprivileged sgl-only passthrough Jens Axboe
@ 2023-10-18 19:06   ` Kanchan Joshi
  2023-10-18 19:12     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Kanchan Joshi @ 2023-10-18 19:06 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Kanchan Joshi, hch, kbusch, sagi, linux-nvme, gost.dev

On Thu, Oct 19, 2023 at 12:10 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 10/18/23 12:30 PM, Kanchan Joshi wrote:
> > Patch 1: Prep. Adds the meta-transfer ability in nvme-pci
> > Patch 2: Enables fine-granular passthrough with the change that i/o
> > commands can transfer the data only via SGL.
> >
> > Requirement:
> > - Prepared against block 6.6 tree.
> > - The patch in uring-passthrough failure handling is required to see the
> >   submission failure (if any)
> >   https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/
>
> I didn't have time to follow the previous discussion, but what's the
> reasoning behind allowing it for SGL only?

This was a solution that emerged while discussing how best to fill the
DMA corruption hole for passthrough. With SGL, the buffer-length
(data/buffer) sanity checks are done by the SSD, and it fails the I/O
rather than doing the extra transfer.

> IIRC, we do have an inline
> vec for a small number of vecs, so presumably this would not hit
> alloc+free for each IO?

A 16-byte dma_pool_alloc/free for each IO that involves metadata. This
is to keep the nvme-sgl that points to the metadata buffer. Hopefully
some ideas can emerge (during the review) to see if we can do away
with it.

> But even so, I would imagine that SGL is slower
> than PRP? Do we know how much?

I do not know at the moment. The plan is to evaluate this soon.

BTW, SGL-only mode is for unprivileged users only. For root, it
remains the same as before (prp or sgl, depending on the data-transfer
length).

^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 19:06 ` Kanchan Joshi
@ 2023-10-18 19:12   ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2023-10-18 19:12 UTC (permalink / raw)
  To: Kanchan Joshi; +Cc: Kanchan Joshi, hch, kbusch, sagi, linux-nvme, gost.dev

On 10/18/23 1:06 PM, Kanchan Joshi wrote:
> On Thu, Oct 19, 2023 at 12:10 AM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 10/18/23 12:30 PM, Kanchan Joshi wrote:
>>> Patch 1: Prep. Adds the meta-transfer ability in nvme-pci
>>> Patch 2: Enables fine-granular passthrough with the change that i/o
>>> commands can transfer the data only via SGL.
>>>
>>> Requirement:
>>> - Prepared against block 6.6 tree.
>>> - The patch in uring-passthrough failure handling is required to see the
>>>   submission failure (if any)
>>>   https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/
>>
>> I didn't have time to follow the previous discussion, but what's the
>> reasoning behind allowing it for SGL only?
>
> This was a solution that emerged while discussing how best to fill the
> DMA corruption hole for passthrough.
> With SGL, the buffer length (data/buffer) sanity checks are done by
> the SSD and it fails the IO rather than doing extra transfer.

Yay hardware...

>> IIRC, we do have an inline
>> vec for a small number of vecs, so presumably this would not hit
>> alloc+free for each IO?
>
> 16b dma_pool_alloc/free for each IO that involves metadata. This is to
> keep the nvme-sgl that points to the metadata buffer.
> Hopefully some ideas can emerge (during the review) to see if we can
> do away with it.

OK, so at least nothing if metadata isn't being used. I know of at
least one use case for metadata and passthrough, so it would be nice to
at least have an eye on making that situation better.

>> But even so, I would imagine that SGL is slower
>> than PRP? Do we know how much?
>
> I do not know at the moment. Plan is to evaluate this soon.
>
> BTW, SGL-only mode is for unprivileged users only. For root, it
> remains the same as before (prp or sgl depending on the data-transfer
> length).

That's nice at least. Thanks for the clarifications.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 18:40 ` [PATCH 0/2] Unprivileged sgl-only passthrough Jens Axboe
  2023-10-18 19:06   ` Kanchan Joshi
@ 2023-10-18 19:35   ` Keith Busch
  2023-10-18 19:37     ` Jens Axboe
  ` (2 more replies)
  1 sibling, 3 replies; 18+ messages in thread
From: Keith Busch @ 2023-10-18 19:35 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Kanchan Joshi, hch, sagi, linux-nvme, gost.dev, joshiiitr

On Wed, Oct 18, 2023 at 12:40:35PM -0600, Jens Axboe wrote:
> On 10/18/23 12:30 PM, Kanchan Joshi wrote:
> > Patch 1: Prep. Adds the meta-transfer ability in nvme-pci
> > Patch 2: Enables fine-granular passthrough with the change that i/o
> > commands can transfer the data only via SGL.
> >
> > Requirement:
> > - Prepared against block 6.6 tree.
> > - The patch in uring-passthrough failure handling is required to see the
> > submission failure (if any)
> > https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/
>
> I didn't have time to follow the previous discussion, but what's the
> reasoning behind allowing it for SGL only? IIRC, we do have an inline
> vec for a small number of vecs, so presumably this would not hit
> alloc+free for each IO? But even so, I would imagine that SGL is slower
> than PRP? Do we know how much?

SGL for metadata is definitely slower, but it's the only nvme protocol
way to directly specify how much memory is actually available for the
command's transfer. PRP/MPTR vs SGL is like strcpy() vs strncpy().

Similar to Kanchan's earlier experience though, I haven't found real
nvme devices that support the SGL mode for metadata. The scenarios this
enables might be pretty limited. :(

The other hardware "solution" is turn on your IOMMU (eww).
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 19:35 ` Keith Busch
@ 2023-10-18 19:37   ` Jens Axboe
  2023-10-18 19:44   ` Kanchan Joshi
  2023-10-18 19:59   ` Kanchan Joshi
  2 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2023-10-18 19:37 UTC (permalink / raw)
  To: Keith Busch; +Cc: Kanchan Joshi, hch, sagi, linux-nvme, gost.dev, joshiiitr

On 10/18/23 1:35 PM, Keith Busch wrote:
> On Wed, Oct 18, 2023 at 12:40:35PM -0600, Jens Axboe wrote:
>> On 10/18/23 12:30 PM, Kanchan Joshi wrote:
>>> Patch 1: Prep. Adds the meta-transfer ability in nvme-pci
>>> Patch 2: Enables fine-granular passthrough with the change that i/o
>>> commands can transfer the data only via SGL.
>>>
>>> Requirement:
>>> - Prepared against block 6.6 tree.
>>> - The patch in uring-passthrough failure handling is required to see the
>>> submission failure (if any)
>>> https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/
>>
>> I didn't have time to follow the previous discussion, but what's the
>> reasoning behind allowing it for SGL only? IIRC, we do have an inline
>> vec for a small number of vecs, so presumably this would not hit
>> alloc+free for each IO? But even so, I would imagine that SGL is slower
>> than PRP? Do we know how much?
>
> SGL for metadata is definitely slower, but it's the only nvme protocol
> way to directly specify how much memory is actually available for the
> command's transfer. PRP/MPTR vs SGL is like strcpy() vs strncpy().

So... is this a metadata only issue? Or does it apply to any
read/write, metadata or not?

> Similar to Kanchan's earlier experience though, I haven't found real
> nvme devices that support the SGL mode for metadata. The scenarios this
> enables might be pretty limited. :(

Well that certainly makes it way less useful.

> The other hardware "solution" is turn on your IOMMU (eww).

Yep, certainly also a way to pay IO taxes.

--
Jens Axboe
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 19:35 ` Keith Busch
  2023-10-18 19:37   ` Jens Axboe
@ 2023-10-18 19:44   ` Kanchan Joshi
  2023-10-18 19:47     ` Jens Axboe
  2023-10-18 19:59   ` Kanchan Joshi
  2 siblings, 1 reply; 18+ messages in thread
From: Kanchan Joshi @ 2023-10-18 19:44 UTC (permalink / raw)
  To: Keith Busch; +Cc: Jens Axboe, Kanchan Joshi, hch, sagi, linux-nvme, gost.dev

On Thu, Oct 19, 2023 at 1:05 AM Keith Busch <kbusch@kernel.org> wrote:
>
> On Wed, Oct 18, 2023 at 12:40:35PM -0600, Jens Axboe wrote:
> > On 10/18/23 12:30 PM, Kanchan Joshi wrote:
> > > Patch 1: Prep. Adds the meta-transfer ability in nvme-pci
> > > Patch 2: Enables fine-granular passthrough with the change that i/o
> > > commands can transfer the data only via SGL.
> > >
> > > Requirement:
> > > - Prepared against block 6.6 tree.
> > > - The patch in uring-passthrough failure handling is required to see the
> > > submission failure (if any)
> > > https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/
> >
> > I didn't have time to follow the previous discussion, but what's the
> > reasoning behind allowing it for SGL only? IIRC, we do have an inline
> > vec for a small number of vecs, so presumably this would not hit
> > alloc+free for each IO? But even so, I would imagine that SGL is slower
> > than PRP? Do we know how much?
>
> SGL for metadata is definitely slower, but it's the only nvme protocol
> way to directly specify how much memory is actually available for the
> command's transfer. PRP/MPTR vs SGL is like strcpy() vs strncpy().
>
> Similar to Kanchan's earlier experience though, I haven't found real
> nvme devices that support the SGL mode for metadata. The scenarios this
> enables might be pretty limited. :(

What I found later was that all the enterprise drives (from Samsung)
support SGL for data/meta. I have tested this code on one such SSD and
on QEMU.
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 19:44 ` Kanchan Joshi
@ 2023-10-18 19:47   ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2023-10-18 19:47 UTC (permalink / raw)
  To: Kanchan Joshi, Keith Busch; +Cc: Kanchan Joshi, hch, sagi, linux-nvme, gost.dev

On 10/18/23 1:44 PM, Kanchan Joshi wrote:
> On Thu, Oct 19, 2023 at 1:05 AM Keith Busch <kbusch@kernel.org> wrote:
>>
>> On Wed, Oct 18, 2023 at 12:40:35PM -0600, Jens Axboe wrote:
>>> On 10/18/23 12:30 PM, Kanchan Joshi wrote:
>>>> Patch 1: Prep. Adds the meta-transfer ability in nvme-pci
>>>> Patch 2: Enables fine-granular passthrough with the change that i/o
>>>> commands can transfer the data only via SGL.
>>>>
>>>> Requirement:
>>>> - Prepared against block 6.6 tree.
>>>> - The patch in uring-passthrough failure handling is required to see the
>>>> submission failure (if any)
>>>> https://lore.kernel.org/linux-nvme/20231018135718.28820-1-joshi.k@samsung.com/
>>>
>>> I didn't have time to follow the previous discussion, but what's the
>>> reasoning behind allowing it for SGL only? IIRC, we do have an inline
>>> vec for a small number of vecs, so presumably this would not hit
>>> alloc+free for each IO? But even so, I would imagine that SGL is slower
>>> than PRP? Do we know how much?
>>
>> SGL for metadata is definitely slower, but it's the only nvme protocol
>> way to directly specify how much memory is actually available for the
>> command's transfer. PRP/MPTR vs SGL is like strcpy() vs strncpy().
>>
>> Similar to Kanchan's earlier experience though, I haven't found real
>> nvme devices that support the SGL mode for metadata. The scenarios this
>> enables might be pretty limited. :(
>
> What I found later was that all the enterprise drives (from Samsung)
> support SGL for data/meta. I have tested this code on one such SSD and
> on QEMU.

Maybe a moot point given the status of the product, but it does look
like the gen2 optanes at least do support SGL for metadata.

--
Jens Axboe
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 19:35 ` Keith Busch
  2023-10-18 19:37   ` Jens Axboe
  2023-10-18 19:44   ` Kanchan Joshi
@ 2023-10-18 19:59   ` Kanchan Joshi
  2023-10-18 21:06     ` Keith Busch
  2 siblings, 1 reply; 18+ messages in thread
From: Kanchan Joshi @ 2023-10-18 19:59 UTC (permalink / raw)
  To: Keith Busch; +Cc: Jens Axboe, Kanchan Joshi, hch, sagi, linux-nvme, gost.dev

On Thu, Oct 19, 2023 at 1:05 AM Keith Busch <kbusch@kernel.org> wrote:
> SGL for metadata is definitely slower, but it's the only nvme protocol
> way to directly specify how much memory is actually available for the
> command's transfer. PRP/MPTR vs SGL is like strcpy() vs strncpy().
>
> Similar to Kanchan's earlier experience though, I haven't found real
> nvme devices that support the SGL mode for metadata. The scenarios this
> enables might be pretty limited. :(

And if not this, what should be the solution to have non-root
passthrough? This is the third one (excluding iommu) that we are
discussing.
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 19:59 ` Kanchan Joshi
@ 2023-10-18 21:06   ` Keith Busch
  2023-10-18 21:08     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Keith Busch @ 2023-10-18 21:06 UTC (permalink / raw)
  To: Kanchan Joshi; +Cc: Jens Axboe, Kanchan Joshi, hch, sagi, linux-nvme, gost.dev

On Thu, Oct 19, 2023 at 01:29:46AM +0530, Kanchan Joshi wrote:
> And if not this, what should be the solution to have non-root
> passthrough?

I agree we need non-root passthrough.

Could we restore what we previously had, but fence it off with a module
parameter to opt-in to allow it? Like setting a silicon chicken bit, and
taint the kernel if that helps convey the responsibility taken with such
a parameter?

The only other possibility I can think of is start a command allow-list
specific to non-root users, along with your earlier patch's decoder.
* Re: [PATCH 0/2] Unprivileged sgl-only passthrough
  2023-10-18 21:06 ` Keith Busch
@ 2023-10-18 21:08   ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2023-10-18 21:08 UTC (permalink / raw)
  To: Keith Busch, Kanchan Joshi; +Cc: Kanchan Joshi, hch, sagi, linux-nvme, gost.dev

On 10/18/23 3:06 PM, Keith Busch wrote:
> On Thu, Oct 19, 2023 at 01:29:46AM +0530, Kanchan Joshi wrote:
>> And if not this, what should be the solution to have non-root
>> passthrough?
>
> I agree we need non-root passthrough.
>
> Could we restore what we previously had, but fence it off with a module
> parameter to opt-in to allow it? Like setting a silicon chicken bit, and
> taint the kernel if that helps convey the responsibility taken with such
> a parameter?

Let's please not do a module parameter, those are just awful to deal
with. I'd much rather see a per-drive toggle for this, which whatever
list could then use. That'd allow you to open the device as root,
toggle the switch per-drive, and then drop caps if that is what your
application does.

I'm going to drop the 6.6 pending bits as, to me, it all seems really
half assed and rushed. Let's just do this right for 6.7, it'll need
backporting anyway.

--
Jens Axboe
end of thread, other threads: [~2023-10-20  4:40 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <CGME20231018183620epcas5p26ab74bdd1f2739ef3ec1ee2431329dc4@epcas5p2.samsung.com>
2023-10-18 18:30 ` [PATCH 0/2] Unprivileged sgl-only passthrough Kanchan Joshi
2023-10-18 18:30 ` [PATCH 1/2] nvme-pci: meta-transfer via sgl Kanchan Joshi
2023-10-19 5:48 ` Christoph Hellwig
2023-10-19 9:54 ` Kanchan Joshi
2023-10-20 4:40 ` Christoph Hellwig
2023-10-18 18:30 ` [PATCH 2/2] nvme: fine-granular CAP_SYS_ADMIN for nvme io/admin commands Kanchan Joshi
2023-10-19 5:49 ` Christoph Hellwig
2023-10-19 9:59 ` Kanchan Joshi
2023-10-18 18:40 ` [PATCH 0/2] Unprivileged sgl-only passthrough Jens Axboe
2023-10-18 19:06 ` Kanchan Joshi
2023-10-18 19:12 ` Jens Axboe
2023-10-18 19:35 ` Keith Busch
2023-10-18 19:37 ` Jens Axboe
2023-10-18 19:44 ` Kanchan Joshi
2023-10-18 19:47 ` Jens Axboe
2023-10-18 19:59 ` Kanchan Joshi
2023-10-18 21:06 ` Keith Busch
2023-10-18 21:08 ` Jens Axboe