* [PATCH] nvme-tcp: print correct opcode on timeout and error handling
@ 2023-03-07 16:30 Akinobu Mita
2023-03-08 11:06 ` Sagi Grimberg
0 siblings, 1 reply; 9+ messages in thread
From: Akinobu Mita @ 2023-03-07 16:30 UTC (permalink / raw)
To: linux-nvme
Cc: Akinobu Mita, Sagi Grimberg, Chaitanya Kulkarni,
Christoph Hellwig, Keith Busch, Jens Axboe, Hannes Reinecke
In timeout and error handling, various information is reported about the
command in error, but the printed opcode may not be correct.
The opcode is obtained from the nvme_command structure referenced by the
'cmd' member in the nvme_request structure.
(i.e. nvme_req(req)->cmd->common.opcode)
For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
points to a structure within the page fragment allocated by
nvme_tcp_init_request(). This page fragment is used as a command capsule
PDU, and then may be reused as a h2c data PDU, so the nvme_command
referenced by the 'cmd' member has already been overwritten.
To fix this problem, when setting up the nvme_command, keep the opcode in
a newly added member in the nvme_tcp_request and use that instead.
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Chaitanya Kulkarni <kch@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Jens Axboe <axboe@fb.com>
Cc: Hannes Reinecke <hare@suse.de>
---
drivers/nvme/host/core.c | 10 ++++++----
drivers/nvme/host/nvme.h | 11 +++++++++++
drivers/nvme/host/tcp.c | 15 ++++++++++++++-
3 files changed, 31 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2730b116dc6..a86f787ba3bd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -310,12 +310,14 @@ static void nvme_log_error(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
struct nvme_request *nr = nvme_req(req);
+ u8 opcode;
+ nvme_get_request_info(req, &opcode);
if (ns) {
pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
ns->disk ? ns->disk->disk_name : "?",
- nvme_get_opcode_str(nr->cmd->common.opcode),
- nr->cmd->common.opcode,
+ nvme_get_opcode_str(opcode),
+ opcode,
(unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
(unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
nvme_get_error_status_str(nr->status),
@@ -328,8 +330,8 @@ static void nvme_log_error(struct request *req)
pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
dev_name(nr->ctrl->device),
- nvme_get_admin_opcode_str(nr->cmd->common.opcode),
- nr->cmd->common.opcode,
+ nvme_get_admin_opcode_str(opcode),
+ opcode,
nvme_get_error_status_str(nr->status),
nr->status >> 8 & 7, /* Status Code Type */
nr->status & 0xff, /* Status Code */
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1..bbb458323e91 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -525,6 +525,7 @@ struct nvme_ctrl_ops {
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
void (*print_device_info)(struct nvme_ctrl *ctrl);
bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl);
+ void (*get_request_info)(struct request *req, u8 *opcode);
};
/*
@@ -597,6 +598,16 @@ static inline void nvme_print_device_info(struct nvme_ctrl *ctrl)
subsys->firmware_rev);
}
+static inline void nvme_get_request_info(struct request *rq, u8 *opcode)
+{
+ struct nvme_request *req = nvme_req(rq);
+
+ if (req->ctrl->ops->get_request_info)
+ req->ctrl->ops->get_request_info(rq, opcode);
+ else
+ *opcode = req->cmd->common.opcode;
+}
+
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
const char *dev_name);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 7723a4989524..a6c0ae51d830 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -90,6 +90,8 @@ struct nvme_tcp_request {
struct list_head entry;
struct llist_node lentry;
__le32 ddgst;
+ u8 opcode;
+ u8 fctype;
struct bio *curr_bio;
struct iov_iter iter;
@@ -2285,7 +2287,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
- u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
+ u8 opc = req->opcode, fctype = req->fctype;
int qid = nvme_tcp_queue_id(req->queue);
dev_warn(ctrl->device,
@@ -2354,6 +2356,9 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
req->state = NVME_TCP_SEND_CMD_PDU;
req->status = cpu_to_le16(NVME_SC_SUCCESS);
+ req->opcode = pdu->cmd.common.opcode;
+ if (req->opcode == nvme_fabrics_command)
+ req->fctype = pdu->cmd.fabrics.fctype;
req->offset = 0;
req->data_sent = 0;
req->pdu_len = 0;
@@ -2509,6 +2514,13 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
return len;
}
+static void nvme_tcp_get_request_info(struct request *rq, u8 *opcode)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+ *opcode = req->opcode;
+}
+
static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.commit_rqs = nvme_tcp_commit_rqs,
@@ -2542,6 +2554,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvme_tcp_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
+ .get_request_info = nvme_tcp_get_request_info,
};
static bool
--
2.34.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-07 16:30 [PATCH] nvme-tcp: print correct opcode on timeout and error handling Akinobu Mita
@ 2023-03-08 11:06 ` Sagi Grimberg
2023-03-08 15:31 ` Akinobu Mita
2023-03-08 23:51 ` Chaitanya Kulkarni
0 siblings, 2 replies; 9+ messages in thread
From: Sagi Grimberg @ 2023-03-08 11:06 UTC (permalink / raw)
To: Akinobu Mita, linux-nvme
Cc: Chaitanya Kulkarni, Christoph Hellwig, Keith Busch, Jens Axboe,
Hannes Reinecke
> In timeout and error handling, various information is reported about the
> command in error, but the printed opcode may not be correct
>
> The opcode is obtained from the nvme_command structure referenced by the
> 'cmd' member in the nvme_request structure.
> (i.e. nvme_req(req)->cmd->common.opcode)
>
> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
> points to a structure within the page fragment allocated by
> nvme_tcp_init_request(). This page fragment is used as a command capsule
> PDU, and then may be reused as a h2c data PDU, so the nvme_command
> referenced by the 'cmd' member has already been overwritten.
>
> To fix this problem, when setting up the nvme_command, keep the opcode in
> a newly added member in the nvme_tcp_request and use that instead.
I'm wondering if we should just not reuse the cmd pdu for a subsequent
data pdu...
It will take more space allocated per request (up to a few MBs for lots
of requests and controllers)...
but will prevent from propagating this to nvme core...
Something like the below (untested):
--
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index ef27122acce6..ffbcd5c7cb26 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -207,6 +207,16 @@ static inline u8 nvme_tcp_ddgst_len(struct
nvme_tcp_queue *queue)
return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
+static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
+{
+ return req->pdu;
+}
+
+static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
+{
+ return req->pdu + sizeof(struct nvme_tcp_cmd_pdu);
+}
+
static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request
*req)
{
if (nvme_is_fabrics(req->req.cmd))
@@ -470,7 +480,8 @@ static int nvme_tcp_init_request(struct
blk_mq_tag_set *set,
u8 hdgst = nvme_tcp_hdgst_len(queue);
req->pdu = page_frag_alloc(&queue->pf_cache,
- sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+ sizeof(struct nvme_tcp_cmd_pdu) +
+ sizeof(struct nvme_tcp_data_pdu) + hdgst,
GFP_KERNEL | __GFP_ZERO);
if (!req->pdu)
return -ENOMEM;
@@ -613,7 +624,7 @@ static int nvme_tcp_handle_comp(struct
nvme_tcp_queue *queue,
static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
{
- struct nvme_tcp_data_pdu *data = req->pdu;
+ struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
struct nvme_tcp_queue *queue = req->queue;
struct request *rq = blk_mq_rq_from_pdu(req);
u32 h2cdata_sent = req->pdu_len;
@@ -1035,7 +1046,7 @@ static int nvme_tcp_try_send_data(struct
nvme_tcp_request *req)
static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
bool inline_data = nvme_tcp_has_inline_data(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
@@ -1074,7 +1085,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct
nvme_tcp_request *req)
static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
- struct nvme_tcp_data_pdu *pdu = req->pdu;
+ struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) - req->offset + hdgst;
int ret;
@@ -2281,7 +2292,7 @@ static enum blk_eh_timer_return
nvme_tcp_timeout(struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
int qid = nvme_tcp_queue_id(req->queue);
@@ -2320,7 +2331,7 @@ static blk_status_t nvme_tcp_map_data(struct
nvme_tcp_queue *queue,
struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_command *c = &pdu->cmd;
c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -2340,7 +2351,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct
nvme_ns *ns,
struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_tcp_queue *queue = req->queue;
u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
blk_status_t ret;
--
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-08 11:06 ` Sagi Grimberg
@ 2023-03-08 15:31 ` Akinobu Mita
2023-03-08 23:51 ` Chaitanya Kulkarni
1 sibling, 0 replies; 9+ messages in thread
From: Akinobu Mita @ 2023-03-08 15:31 UTC (permalink / raw)
To: Sagi Grimberg
Cc: linux-nvme, Chaitanya Kulkarni, Christoph Hellwig, Keith Busch,
Jens Axboe, Hannes Reinecke
2023年3月8日(水) 20:06 Sagi Grimberg <sagi@grimberg.me>:
>
>
> > In timeout and error handling, various information is reported about the
> > command in error, but the printed opcode may not be correct
> >
> > The opcode is obtained from the nvme_command structure referenced by the
> > 'cmd' member in the nvme_request structure.
> > (i.e. nvme_req(req)->cmd->common.opcode)
> >
> > For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
> > points to a structure within the page fragment allocated by
> > nvme_tcp_init_request(). This page fragment is used as a command capsule
> > PDU, and then may be reused as a h2c data PDU, so the nvme_command
> > referenced by the 'cmd' member has already been overwritten.
> >
> > To fix this problem, when setting up the nvme_command, keep the opcode in
> > a newly added member in the nvme_tcp_request and use that instead.
>
> I'm wandering if we should just not reuse the cmd pdu for a subsequent
> data pdu...
>
> It will take more space allocated per request (up to a few MBs for lots
> of requests and controllers)...
>
> but will prevent from propagating this to nvme core...
>
> Something like the below (untested):
Your fix looks good. It actually fixed the problem as well.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-08 11:06 ` Sagi Grimberg
2023-03-08 15:31 ` Akinobu Mita
@ 2023-03-08 23:51 ` Chaitanya Kulkarni
2023-03-08 23:57 ` Keith Busch
1 sibling, 1 reply; 9+ messages in thread
From: Chaitanya Kulkarni @ 2023-03-08 23:51 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Chaitanya Kulkarni, Christoph Hellwig, Keith Busch, Jens Axboe,
linux-nvme@lists.infradead.org, Akinobu Mita, Hannes Reinecke
Sagi,
On 3/8/23 03:06, Sagi Grimberg wrote:
>
>> In timeout and error handling, various information is reported about the
>> command in error, but the printed opcode may not be correct
>>
>> The opcode is obtained from the nvme_command structure referenced by the
>> 'cmd' member in the nvme_request structure.
>> (i.e. nvme_req(req)->cmd->common.opcode)
>>
>> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
>> points to a structure within the page fragment allocated by
>> nvme_tcp_init_request(). This page fragment is used as a command
>> capsule
>> PDU, and then may be reused as a h2c data PDU, so the nvme_command
>> referenced by the 'cmd' member has already been overwritten.
>>
>> To fix this problem, when setting up the nvme_command, keep the
>> opcode in
>> a newly added member in the nvme_tcp_request and use that instead.
>
> I'm wandering if we should just not reuse the cmd pdu for a subsequent
> data pdu...
>
> It will take more space allocated per request (up to a few MBs for lots
> of requests and controllers)...
>
> but will prevent from propagating this to nvme core...
>
Is there a way we can do this without increasing the memory usage?
-ck
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-08 23:51 ` Chaitanya Kulkarni
@ 2023-03-08 23:57 ` Keith Busch
2023-03-09 1:46 ` Chaitanya Kulkarni
2023-03-09 9:10 ` Sagi Grimberg
0 siblings, 2 replies; 9+ messages in thread
From: Keith Busch @ 2023-03-08 23:57 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: Sagi Grimberg, Christoph Hellwig, Jens Axboe,
linux-nvme@lists.infradead.org, Akinobu Mita, Hannes Reinecke
On Wed, Mar 08, 2023 at 11:51:21PM +0000, Chaitanya Kulkarni wrote:
> Sagi,
>
> On 3/8/23 03:06, Sagi Grimberg wrote:
> >
> >> In timeout and error handling, various information is reported about the
> >> command in error, but the printed opcode may not be correct
> >>
> >> The opcode is obtained from the nvme_command structure referenced by the
> >> 'cmd' member in the nvme_request structure.
> >> (i.e. nvme_req(req)->cmd->common.opcode)
> >>
> >> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
> >> points to a structure within the page fragment allocated by
> >> nvme_tcp_init_request(). This page fragment is used as a command
> >> capsule
> >> PDU, and then may be reused as a h2c data PDU, so the nvme_command
> >> referenced by the 'cmd' member has already been overwritten.
> >>
> >> To fix this problem, when setting up the nvme_command, keep the
> >> opcode in
> >> a newly added member in the nvme_tcp_request and use that instead.
> >
> > I'm wandering if we should just not reuse the cmd pdu for a subsequent
> > data pdu...
> >
> > It will take more space allocated per request (up to a few MBs for lots
> > of requests and controllers)...
> >
> > but will prevent from propagating this to nvme core...
> >
>
> Is there a way we can do this without increasing the memory usage ?
The data pdu has to go somewhere. There doesn't appear to be another free area
other than the command pdu, so if that's off limits to preserve the original
sqe, then more memory is needed.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-08 23:57 ` Keith Busch
@ 2023-03-09 1:46 ` Chaitanya Kulkarni
2023-03-09 9:10 ` Sagi Grimberg
1 sibling, 0 replies; 9+ messages in thread
From: Chaitanya Kulkarni @ 2023-03-09 1:46 UTC (permalink / raw)
To: Keith Busch
Cc: Sagi Grimberg, Christoph Hellwig, Jens Axboe,
linux-nvme@lists.infradead.org, Akinobu Mita, Hannes Reinecke
>>>> To fix this problem, when setting up the nvme_command, keep the
>>>> opcode in
>>>> a newly added member in the nvme_tcp_request and use that instead.
>>> I'm wandering if we should just not reuse the cmd pdu for a subsequent
>>> data pdu...
>>>
>>> It will take more space allocated per request (up to a few MBs for lots
>>> of requests and controllers)...
>>>
>>> but will prevent from propagating this to nvme core...
>>>
>> Is there a way we can do this without increasing the memory usage ?
> The data pdu has to go somewhere. There doesn't appear to be another free area
> other than the command pdu, so if that's off limits to preserve the original
> sqe, then more memory is needed.
hmmm I guess there is no option then ...
-ck
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-08 23:57 ` Keith Busch
2023-03-09 1:46 ` Chaitanya Kulkarni
@ 2023-03-09 9:10 ` Sagi Grimberg
2023-03-12 12:56 ` Akinobu Mita
1 sibling, 1 reply; 9+ messages in thread
From: Sagi Grimberg @ 2023-03-09 9:10 UTC (permalink / raw)
To: Keith Busch, Chaitanya Kulkarni
Cc: Christoph Hellwig, Jens Axboe, linux-nvme@lists.infradead.org,
Akinobu Mita, Hannes Reinecke
>> Sagi,
>>
>> On 3/8/23 03:06, Sagi Grimberg wrote:
>>>
>>>> In timeout and error handling, various information is reported about the
>>>> command in error, but the printed opcode may not be correct
>>>>
>>>> The opcode is obtained from the nvme_command structure referenced by the
>>>> 'cmd' member in the nvme_request structure.
>>>> (i.e. nvme_req(req)->cmd->common.opcode)
>>>>
>>>> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
>>>> points to a structure within the page fragment allocated by
>>>> nvme_tcp_init_request(). This page fragment is used as a command
>>>> capsule
>>>> PDU, and then may be reused as a h2c data PDU, so the nvme_command
>>>> referenced by the 'cmd' member has already been overwritten.
>>>>
>>>> To fix this problem, when setting up the nvme_command, keep the
>>>> opcode in
>>>> a newly added member in the nvme_tcp_request and use that instead.
>>>
>>> I'm wandering if we should just not reuse the cmd pdu for a subsequent
>>> data pdu...
>>>
>>> It will take more space allocated per request (up to a few MBs for lots
>>> of requests and controllers)...
>>>
>>> but will prevent from propagating this to nvme core...
>>>
>>
>> Is there a way we can do this without increasing the memory usage ?
>
> The data pdu has to go somewhere. There doesn't appear to be another free area
> other than the command pdu, so if that's off limits to preserve the original
> sqe, then more memory is needed.
If I take a somewhat extreme calculation, where a host has 16
controllers, 128 queues of depth 128 for each I end up with additional
6MB, which is not insignificant for a driver...
We could go with the original proposal of storing just the opcode, but
it then propagates to the core...
Or...
The cmd pdu has a size of 72 bytes (header + 64 byte command), perhaps
we can still reuse the same PDU space but rather take the last 24 bytes
of the cmd pdu?
Something like:
--
+static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
+{
+ return req->pdu;
+}
+
+static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
+{
+ /* use the pdu space in the back for the data pdu */
+ return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
+ sizeof(struct nvme_tcp_data_pdu);
+}
+
static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request
*req)
{
if (nvme_is_fabrics(req->req.cmd))
@@ -613,7 +625,7 @@ static int nvme_tcp_handle_comp(struct
nvme_tcp_queue *queue,
static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
{
- struct nvme_tcp_data_pdu *data = req->pdu;
+ struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
struct nvme_tcp_queue *queue = req->queue;
struct request *rq = blk_mq_rq_from_pdu(req);
u32 h2cdata_sent = req->pdu_len;
@@ -1035,7 +1047,7 @@ static int nvme_tcp_try_send_data(struct
nvme_tcp_request *req)
static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
bool inline_data = nvme_tcp_has_inline_data(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
@@ -1074,7 +1086,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct
nvme_tcp_request *req)
static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
- struct nvme_tcp_data_pdu *pdu = req->pdu;
+ struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) - req->offset + hdgst;
int ret;
@@ -2281,7 +2293,7 @@ static enum blk_eh_timer_return
nvme_tcp_timeout(struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
int qid = nvme_tcp_queue_id(req->queue);
@@ -2320,7 +2332,7 @@ static blk_status_t nvme_tcp_map_data(struct
nvme_tcp_queue *queue,
struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_command *c = &pdu->cmd;
c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -2340,7 +2352,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct
nvme_ns *ns,
struct request *rq)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_tcp_queue *queue = req->queue;
u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
blk_status_t ret;
--
This way we can still abuse the pdu space without allocating additional
24 bytes per request, and live with it until it bothers something else?
Thoughts?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-09 9:10 ` Sagi Grimberg
@ 2023-03-12 12:56 ` Akinobu Mita
2023-03-13 8:54 ` Sagi Grimberg
0 siblings, 1 reply; 9+ messages in thread
From: Akinobu Mita @ 2023-03-12 12:56 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Keith Busch, Chaitanya Kulkarni, Christoph Hellwig, Jens Axboe,
linux-nvme@lists.infradead.org, Hannes Reinecke
2023年3月9日(木) 18:10 Sagi Grimberg <sagi@grimberg.me>:
>
>
> >> Sagi,
> >>
> >> On 3/8/23 03:06, Sagi Grimberg wrote:
> >>>
> >>>> In timeout and error handling, various information is reported about the
> >>>> command in error, but the printed opcode may not be correct
> >>>>
> >>>> The opcode is obtained from the nvme_command structure referenced by the
> >>>> 'cmd' member in the nvme_request structure.
> >>>> (i.e. nvme_req(req)->cmd->common.opcode)
> >>>>
> >>>> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
> >>>> points to a structure within the page fragment allocated by
> >>>> nvme_tcp_init_request(). This page fragment is used as a command
> >>>> capsule
> >>>> PDU, and then may be reused as a h2c data PDU, so the nvme_command
> >>>> referenced by the 'cmd' member has already been overwritten.
> >>>>
> >>>> To fix this problem, when setting up the nvme_command, keep the
> >>>> opcode in
> >>>> a newly added member in the nvme_tcp_request and use that instead.
> >>>
> >>> I'm wandering if we should just not reuse the cmd pdu for a subsequent
> >>> data pdu...
> >>>
> >>> It will take more space allocated per request (up to a few MBs for lots
> >>> of requests and controllers)...
> >>>
> >>> but will prevent from propagating this to nvme core...
> >>>
> >>
> >> Is there a way we can do this without increasing the memory usage ?
> >
> > The data pdu has to go somewhere. There doesn't appear to be another free area
> > other than the command pdu, so if that's off limits to preserve the original
> > sqe, then more memory is needed.
>
> If I take a somewhat extreme calculation, where a host has 16
> controllers, 128 queues of depth 128 for each I end up with additional
> 6MB, which is not insignificant for a driver...
>
> We could go with the original proposal of storing just the opcode, but
> it then propagates to the core...
>
> Or...
>
> The cmd pdu has a size of 72 bytes (header + 64 byte command), perhaps
> we can still reuse the same PDU space but rather take the last 24 bytes
> of the cmd pdu?
This also fixed the issue.
Would adding the following static check help avoid unintended changes
in the future?
BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) <=
offsetof(struct nvme_tcp_cmd_pdu, cmd.fabrics.fctype) +
sizeof(struct nvme_tcp_data_pdu));
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-tcp: print correct opcode on timeout and error handling
2023-03-12 12:56 ` Akinobu Mita
@ 2023-03-13 8:54 ` Sagi Grimberg
0 siblings, 0 replies; 9+ messages in thread
From: Sagi Grimberg @ 2023-03-13 8:54 UTC (permalink / raw)
To: Akinobu Mita
Cc: Keith Busch, Chaitanya Kulkarni, Christoph Hellwig, Jens Axboe,
linux-nvme@lists.infradead.org, Hannes Reinecke
>>>> Sagi,
>>>>
>>>> On 3/8/23 03:06, Sagi Grimberg wrote:
>>>>>
>>>>>> In timeout and error handling, various information is reported about the
>>>>>> command in error, but the printed opcode may not be correct
>>>>>>
>>>>>> The opcode is obtained from the nvme_command structure referenced by the
>>>>>> 'cmd' member in the nvme_request structure.
>>>>>> (i.e. nvme_req(req)->cmd->common.opcode)
>>>>>>
>>>>>> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
>>>>>> points to a structure within the page fragment allocated by
>>>>>> nvme_tcp_init_request(). This page fragment is used as a command
>>>>>> capsule
>>>>>> PDU, and then may be reused as a h2c data PDU, so the nvme_command
>>>>>> referenced by the 'cmd' member has already been overwritten.
>>>>>>
>>>>>> To fix this problem, when setting up the nvme_command, keep the
>>>>>> opcode in
>>>>>> a newly added member in the nvme_tcp_request and use that instead.
>>>>>
>>>>> I'm wandering if we should just not reuse the cmd pdu for a subsequent
>>>>> data pdu...
>>>>>
>>>>> It will take more space allocated per request (up to a few MBs for lots
>>>>> of requests and controllers)...
>>>>>
>>>>> but will prevent from propagating this to nvme core...
>>>>>
>>>>
>>>> Is there a way we can do this without increasing the memory usage ?
>>>
>>> The data pdu has to go somewhere. There doesn't appear to be another free area
>>> other than the command pdu, so if that's off limits to preserve the original
>>> sqe, then more memory is needed.
>>
>> If I take a somewhat extreme calculation, where a host has 16
>> controllers, 128 queues of depth 128 for each I end up with additional
>> 6MB, which is not insignificant for a driver...
>>
>> We could go with the original proposal of storing just the opcode, but
>> it then propagates to the core...
>>
>> Or...
>>
>> The cmd pdu has a size of 72 bytes (header + 64 byte command), perhaps
>> we can still reuse the same PDU space but rather take the last 24 bytes
>> of the cmd pdu?
>
> This also fixed the issue.
I'm assuming this is equivalent to your Tested-by tag?
> Would adding the following static check help avoid unintended changes
> in the future?
>
> BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) <=
> offsetof(struct nvme_tcp_cmd_pdu, cmd.fabrics.fctype) +
> sizeof(struct nvme_tcp_data_pdu));
Umm, The arithmetic itself in nvme_tcp_req_data_pdu prevents this sort
of thing. What we probably need regardless and is missing is:
--
@@ -2679,6 +2691,15 @@ static struct nvmf_transport_ops
nvme_tcp_transport = {
static int __init nvme_tcp_init_module(void)
{
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
+ BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
+
nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
if (!nvme_tcp_wq)
--
I'll send this and the fix promptly.
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-03-13 8:55 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-03-07 16:30 [PATCH] nvme-tcp: print correct opcode on timeout and error handling Akinobu Mita
2023-03-08 11:06 ` Sagi Grimberg
2023-03-08 15:31 ` Akinobu Mita
2023-03-08 23:51 ` Chaitanya Kulkarni
2023-03-08 23:57 ` Keith Busch
2023-03-09 1:46 ` Chaitanya Kulkarni
2023-03-09 9:10 ` Sagi Grimberg
2023-03-12 12:56 ` Akinobu Mita
2023-03-13 8:54 ` Sagi Grimberg
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox