* [PATCH] nvme-tcp: delay error recovery after link drop
@ 2022-07-14 12:27 Hannes Reinecke
2022-07-14 14:42 ` Sagi Grimberg
2022-09-06 17:30 ` John Meneghini
0 siblings, 2 replies; 7+ messages in thread
From: Hannes Reinecke @ 2022-07-14 12:27 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Keith Busch, Sagi Grimberg, linux-nvme, Hannes Reinecke
When the connection unexpectedly closes we must not start error
recovery right away, as the controller might not have registered
the connection failure and so retrying commands directly
might lead to data corruption.
So wait for KATO before starting error recovery to be on the safe
side; chances are the commands will time out before that anyway.
Signed-off-by: Hannes Reinecke <hare@suse.de>
---
drivers/nvme/host/tcp.c | 37 +++++++++++++++++++++----------
drivers/nvme/target/io-cmd-bdev.c | 18 +++++++++++++++
2 files changed, 43 insertions(+), 12 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a7848e430a5c..70096a2e8762 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -167,7 +167,7 @@ struct nvme_tcp_ctrl {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
- struct work_struct err_work;
+ struct delayed_work err_work;
struct delayed_work connect_work;
struct nvme_tcp_request async_req;
u32 io_queues[HCTX_MAX_TYPES];
@@ -522,13 +522,21 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
queue->ddgst_remaining = 0;
}
-static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
+static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl, bool delay)
{
+ int tmo = delay ? (ctrl->kato ? ctrl->kato : NVME_DEFAULT_KATO) : 0;
+
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return;
- dev_warn(ctrl->device, "starting error recovery\n");
- queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
+ if (tmo)
+ dev_warn(ctrl->device,
+ "delaying error recovery by %d seconds\n", tmo);
+ else
+ dev_warn(ctrl->device,
+ "starting error recovery\n");
+ queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work,
+ tmo * HZ);
}
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
@@ -542,7 +550,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
dev_err(queue->ctrl->ctrl.device,
"got bad cqe.command_id %#x on queue %d\n",
cqe->command_id, nvme_tcp_queue_id(queue));
- nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
return -EINVAL;
}
@@ -584,7 +592,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x SUCCESS set but not last PDU\n",
nvme_tcp_queue_id(queue), rq->tag);
- nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
return -EPROTO;
}
@@ -895,7 +903,7 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
dev_err(queue->ctrl->ctrl.device,
"receive failed: %d\n", result);
queue->rd_enabled = false;
- nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
return result;
}
}
@@ -943,7 +951,12 @@ static void nvme_tcp_state_change(struct sock *sk)
case TCP_LAST_ACK:
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
- nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ /*
+ * We cannot start error recovery right away, as we need
+ * to wait for KATO to trigger to give the controller a
+ * chance to detect the failure.
+ */
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl, true);
break;
default:
dev_info(queue->ctrl->ctrl.device,
@@ -2171,7 +2184,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
- struct nvme_tcp_ctrl, err_work);
+ struct nvme_tcp_ctrl, err_work.work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
nvme_auth_stop(ctrl);
@@ -2195,7 +2208,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
- cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+ cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work.work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown);
@@ -2355,7 +2368,7 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
* LIVE state should trigger the normal error recovery which will
* handle completing this request.
*/
- nvme_tcp_error_recovery(ctrl);
+ nvme_tcp_error_recovery(ctrl, false);
return BLK_EH_RESET_TIMER;
}
@@ -2596,7 +2609,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
INIT_DELAYED_WORK(&ctrl->connect_work,
nvme_tcp_reconnect_ctrl_work);
- INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
+ INIT_DELAYED_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
if (!(opts->mask & NVMF_OPT_TRSVCID)) {
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 27a72504d31c..bbeeeeaedc9b 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -157,6 +157,24 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
req->error_loc = offsetof(struct nvme_rw_command, nsid);
break;
case BLK_STS_IOERR:
+ switch (req->cmd->common.opcode) {
+ case nvme_cmd_read:
+ status = NVME_SC_READ_ERROR;
+ req->error_loc = offsetof(struct nvme_rw_command,
+ slba);
+ break;
+ case nvme_cmd_write:
+ status = NVME_SC_WRITE_FAULT;
+ req->error_loc = offsetof(struct nvme_rw_command,
+ slba);
+ break;
+ default:
+ status = NVME_SC_DATA_XFER_ERROR | NVME_SC_DNR;
+ req->error_loc = offsetof(struct nvme_common_command,
+ opcode);
+ break;
+ }
+ break;
default:
status = NVME_SC_INTERNAL | NVME_SC_DNR;
req->error_loc = offsetof(struct nvme_common_command, opcode);
--
2.29.2
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH] nvme-tcp: delay error recovery after link drop
2022-07-14 12:27 [PATCH] nvme-tcp: delay error recovery after link drop Hannes Reinecke
@ 2022-07-14 14:42 ` Sagi Grimberg
2022-07-14 15:15 ` Hannes Reinecke
2022-09-06 17:30 ` John Meneghini
1 sibling, 1 reply; 7+ messages in thread
From: Sagi Grimberg @ 2022-07-14 14:42 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme
> When the connection unexpectedly closes we must not start error
> recovery right away, as the controller might not have registered
> the connection failure and so retrying commands directly
> might lead to data corruption.
> So wait for KATO before starting error recovery to be on the safe
> side; chances are the commands will time out before that anyway.
We can't just blindly add kato to the error recovery because that
by definition creates an intrinsic delay for I/O failover. There is
absolutely no reason whatsoever to do that. kato can be arbitrarily
long.
If the controller needs this delay, then it should signal this
somehow in its capabilities. This is not something that we can just
blindly do. So this needs to go through the TWG.
Also, how come this is TCP specific anyway? This talks about a
controller that has a dangling inflight command that he cannot fence
yet hence cannot serve failover I/O. RDMA should have the same thing.
>
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> ---
> drivers/nvme/host/tcp.c | 37 +++++++++++++++++++++----------
> drivers/nvme/target/io-cmd-bdev.c | 18 +++++++++++++++
> 2 files changed, 43 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index a7848e430a5c..70096a2e8762 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -167,7 +167,7 @@ struct nvme_tcp_ctrl {
> struct sockaddr_storage src_addr;
> struct nvme_ctrl ctrl;
>
> - struct work_struct err_work;
> + struct delayed_work err_work;
> struct delayed_work connect_work;
> struct nvme_tcp_request async_req;
> u32 io_queues[HCTX_MAX_TYPES];
> @@ -522,13 +522,21 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> queue->ddgst_remaining = 0;
> }
>
> -static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl, bool delay)
> {
> + int tmo = delay ? (ctrl->kato ? ctrl->kato : NVME_DEFAULT_KATO) : 0;
> +
> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> return;
>
> - dev_warn(ctrl->device, "starting error recovery\n");
> - queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
> + if (tmo)
> + dev_warn(ctrl->device,
> + "delaying error recovery by %d seconds\n", tmo);
> + else
> + dev_warn(ctrl->device,
> + "starting error recovery\n");
> + queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work,
> + tmo * HZ);
> }
>
> static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> @@ -542,7 +550,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "got bad cqe.command_id %#x on queue %d\n",
> cqe->command_id, nvme_tcp_queue_id(queue));
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EINVAL;
> }
>
> @@ -584,7 +592,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "queue %d tag %#x SUCCESS set but not last PDU\n",
> nvme_tcp_queue_id(queue), rq->tag);
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EPROTO;
> }
>
> @@ -895,7 +903,7 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
> dev_err(queue->ctrl->ctrl.device,
> "receive failed: %d\n", result);
> queue->rd_enabled = false;
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return result;
> }
> }
> @@ -943,7 +951,12 @@ static void nvme_tcp_state_change(struct sock *sk)
> case TCP_LAST_ACK:
> case TCP_FIN_WAIT1:
> case TCP_FIN_WAIT2:
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + /*
> + * We cannot start error recovery right away, as we need
> + * to wait for KATO to trigger to give the controller a
> + * chance to detect the failure.
> + */
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, true);
> break;
> default:
> dev_info(queue->ctrl->ctrl.device,
> @@ -2171,7 +2184,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> static void nvme_tcp_error_recovery_work(struct work_struct *work)
> {
> struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> - struct nvme_tcp_ctrl, err_work);
> + struct nvme_tcp_ctrl, err_work.work);
> struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>
> nvme_auth_stop(ctrl);
> @@ -2195,7 +2208,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
>
> static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
> {
> - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
> + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work.work);
> cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
>
> nvme_tcp_teardown_io_queues(ctrl, shutdown);
> @@ -2355,7 +2368,7 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
> * LIVE state should trigger the normal error recovery which will
> * handle completing this request.
> */
> - nvme_tcp_error_recovery(ctrl);
> + nvme_tcp_error_recovery(ctrl, false);
> return BLK_EH_RESET_TIMER;
> }
>
> @@ -2596,7 +2609,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
>
> INIT_DELAYED_WORK(&ctrl->connect_work,
> nvme_tcp_reconnect_ctrl_work);
> - INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> + INIT_DELAYED_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
>
> if (!(opts->mask & NVMF_OPT_TRSVCID)) {
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 27a72504d31c..bbeeeeaedc9b 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -157,6 +157,24 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
> req->error_loc = offsetof(struct nvme_rw_command, nsid);
> break;
> case BLK_STS_IOERR:
> + switch (req->cmd->common.opcode) {
> + case nvme_cmd_read:
> + status = NVME_SC_READ_ERROR;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + case nvme_cmd_write:
> + status = NVME_SC_WRITE_FAULT;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + default:
> + status = NVME_SC_DATA_XFER_ERROR | NVME_SC_DNR;
> + req->error_loc = offsetof(struct nvme_common_command,
> + opcode);
> + break;
> + }
> + break;
unrelated hunk.
> default:
> status = NVME_SC_INTERNAL | NVME_SC_DNR;
> req->error_loc = offsetof(struct nvme_common_command, opcode);
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH] nvme-tcp: delay error recovery after link drop
2022-07-14 14:42 ` Sagi Grimberg
@ 2022-07-14 15:15 ` Hannes Reinecke
2022-07-14 16:07 ` Sagi Grimberg
0 siblings, 1 reply; 7+ messages in thread
From: Hannes Reinecke @ 2022-07-14 15:15 UTC (permalink / raw)
To: Sagi Grimberg, Christoph Hellwig; +Cc: Keith Busch, linux-nvme
On 7/14/22 16:42, Sagi Grimberg wrote:
>
>> When the connection unexpectedly closes we must not start error
>> recovery right away, as the controller might not have registered
>> the connection failure and so retrying commands directly
>> might lead to data corruption.
>> So wait for KATO before starting error recovery to be on the safe
>> side; chances are the commands will time out before that anyway.
>
> We can't just blindly add kato to the error recovery because that
> by definition creates an intrinsic delay for I/O failover. There is
> absolutely no reason whatsoever to do that. kato can be arbitrarily
> long.
>
Yes, true. But the controller might need to register the connection
timeout and do some cleanup action afterwards, for which KATO is the
only value we have currently.
> If the controller needs this delay, then it should signal this
> somehow in its capabilities. This is not something that we can just
> blindly do. So this needs to go through the TWG.
>
... as you might be aware, this is discussed at the TWG.
There even is a TPAR 4129 which deals with precisely this issue.
But consensus is that an immediate retry on path failure is dangerous,
as the controller might not have detected the path failure yet, and will
wait for at least KATO before declaring a path dead and start recovery
actions, potentially clearing out stale/stuck commands.
And retry on another path during that time is dangerous.
And yes, we do have customers who have seen this in real life.
> Also, how come this is TCP specific anyway? This talks about a
> controller that has a dangling inflight command that he cannot fence
> yet hence cannot serve failover I/O. RDMA should have the same thing.
>
The crucial bit here is the 'nvme_tcp_state_change()' callback, which
will trigger recovery as soon as it detects a connection failure.
RDMA doesn't have a direct equivalent here, and I'm not that deep into
RDMA details to know if the RDMA_CM_EVENT_DISCONNECTED is synchronized
with the remote side. If it is, we're fine. If not, we'll have the same
issue.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-tcp: delay error recovery after link drop
2022-07-14 15:15 ` Hannes Reinecke
@ 2022-07-14 16:07 ` Sagi Grimberg
0 siblings, 0 replies; 7+ messages in thread
From: Sagi Grimberg @ 2022-07-14 16:07 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, linux-nvme
>>> When the connection unexpectedly closes we must not start error
>>> recovery right away, as the controller might not have registered
>>> the connection failure and so retrying commands directly
>>> might lead to data corruption.
>>> So wait for KATO before starting error recovery to be on the safe
>>> side; chances are the commands will time out before that anyway.
>>
>> We can't just blindly add kato to the error recovery because that
>> by definition creates an intrinsic delay for I/O failover. There is
>> absolutely no reason whatsoever to do that. kato can be arbitrarily
>> long.
>>
> Yes, true. But the controller might need to register the connection
> timeout and do some cleanup action afterwards, for which KATO is the
> only value we have currently.
Yes, but it is orthogonal to the problem you are describing. This is
tying two things that are unrelated. Plus, kato is controlled by the
user and now it affects the failover latency? kato can be 1s, is that
enough? No one knows... it's all voodoo.
>> If the controller needs this delay, then it should signal this
>> somehow in its capabilities. This is not something that we can just
>> blindly do. So this needs to go through the TWG.
>>
> ... as you might be aware, this is discussed at the TWG.
I remember discussing this once, and was against doing this
unconditionally. This is by definition a result of a controller
implementation. There are controllers out there that know how
to fence against the issue you are describing and there is no
reason that the host will failover after kato for no reason.
If the controller needs it, the controller needs to reflect
it to the host.
> There even is a TPAR 4129 which deals with precisely this issue.
> But consensus is that an immediate retry on path failure is dangerous,
> as the controller might not have detected the path failure yet, and will
> wait for at least KATO before declaring a path dead and start recovery
> actions, potentially clearing out stale/stuck commands.
> And retry on another path during that time is dangerous.
Dangerous is not enough to universally do this unfortunately. If for
some reason that is unclear to me, there is resistance to have the
controller reflect what it needs, then add a failover_tmo parameter
that defaults to 0. For controllers that need a different timeout,
the user can explicitly state it.
> And yes, we do have customers who have seen this in real life.
>
>> Also, how come this is TCP specific anyway? This talks about a
>> controller that has a dangling inflight command that he cannot fence
>> yet hence cannot serve failover I/O. RDMA should have the same thing.
>>
> The crucial bit here is the 'nvme_tcp_state_change()' callback, which
> will trigger recovery as soon as it detects a connection failure.
>
> RDMA doesn't have a direct equivalent here, and I'm not that deep into
> RDMA details to know if the RDMA_CM_EVENT_DISCONNECTED is synchronized
> with the remote side. If it is, we're fine. If not, we'll have the same
> issue.
RDMA_CM_EVENT_DISCONNECTED is unrelated to what you are describing, the
rdma_cm event you are referring to is RDMA_CM_EVENT_TIMEWAIT_EXIT (i.e.
the host sent a disconnect request and timed out). At least that is the case,
afair.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-tcp: delay error recovery after link drop
2022-07-14 12:27 [PATCH] nvme-tcp: delay error recovery after link drop Hannes Reinecke
2022-07-14 14:42 ` Sagi Grimberg
@ 2022-09-06 17:30 ` John Meneghini
2022-09-07 7:57 ` Sagi Grimberg
1 sibling, 1 reply; 7+ messages in thread
From: John Meneghini @ 2022-09-06 17:30 UTC (permalink / raw)
To: Hannes Reinecke, Christoph Hellwig; +Cc: Keith Busch, Sagi Grimberg, linux-nvme
I think the problem Hannes is trying to address with this patch is: the Spec currently says:
Base 2.0 3.3.2.4
If an NVMe Transport connection is lost as a result of an NVMe Transport error, then before performing
recovery actions related to commands sent on I/O queues associated with that NVMe Transport connection,
the host should wait for at least the longer of:
- the NVMe Keep Alive timeout; or
- the underlying fabric transport timeout, if any.
I'm not sure the NVMe/TCP host stack obeys this rule.
The problem is, when the host fails to follow this rule, some NVMe/TCP controllers are found to corrupt data during cable pull
tests.
/John
On 7/14/22 08:27, Hannes Reinecke wrote:
> When the connection unexpectedly closes we must not start error
> recovery right away, as the controller might not have registered
> the connection failure and so retrying commands directly
> might lead to data corruption.
> So wait for KATO before starting error recovery to be on the safe
> side; chances are the commands will time out before that anyway.
>
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> ---
> drivers/nvme/host/tcp.c | 37 +++++++++++++++++++++----------
> drivers/nvme/target/io-cmd-bdev.c | 18 +++++++++++++++
> 2 files changed, 43 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index a7848e430a5c..70096a2e8762 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -167,7 +167,7 @@ struct nvme_tcp_ctrl {
> struct sockaddr_storage src_addr;
> struct nvme_ctrl ctrl;
>
> - struct work_struct err_work;
> + struct delayed_work err_work;
> struct delayed_work connect_work;
> struct nvme_tcp_request async_req;
> u32 io_queues[HCTX_MAX_TYPES];
> @@ -522,13 +522,21 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> queue->ddgst_remaining = 0;
> }
>
> -static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl, bool delay)
> {
> + int tmo = delay ? (ctrl->kato ? ctrl->kato : NVME_DEFAULT_KATO) : 0;
> +
> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> return;
>
> - dev_warn(ctrl->device, "starting error recovery\n");
> - queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
> + if (tmo)
> + dev_warn(ctrl->device,
> + "delaying error recovery by %d seconds\n", tmo);
> + else
> + dev_warn(ctrl->device,
> + "starting error recovery\n");
> + queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work,
> + tmo * HZ);
> }
>
> static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> @@ -542,7 +550,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "got bad cqe.command_id %#x on queue %d\n",
> cqe->command_id, nvme_tcp_queue_id(queue));
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EINVAL;
> }
>
> @@ -584,7 +592,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "queue %d tag %#x SUCCESS set but not last PDU\n",
> nvme_tcp_queue_id(queue), rq->tag);
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EPROTO;
> }
>
> @@ -895,7 +903,7 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
> dev_err(queue->ctrl->ctrl.device,
> "receive failed: %d\n", result);
> queue->rd_enabled = false;
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return result;
> }
> }
> @@ -943,7 +951,12 @@ static void nvme_tcp_state_change(struct sock *sk)
> case TCP_LAST_ACK:
> case TCP_FIN_WAIT1:
> case TCP_FIN_WAIT2:
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + /*
> + * We cannot start error recovery right away, as we need
> + * to wait for KATO to trigger to give the controller a
> + * chance to detect the failure.
> + */
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, true);
> break;
> default:
> dev_info(queue->ctrl->ctrl.device,
> @@ -2171,7 +2184,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> static void nvme_tcp_error_recovery_work(struct work_struct *work)
> {
> struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> - struct nvme_tcp_ctrl, err_work);
> + struct nvme_tcp_ctrl, err_work.work);
> struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>
> nvme_auth_stop(ctrl);
> @@ -2195,7 +2208,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
>
> static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
> {
> - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
> + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work.work);
> cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
>
> nvme_tcp_teardown_io_queues(ctrl, shutdown);
> @@ -2355,7 +2368,7 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
> * LIVE state should trigger the normal error recovery which will
> * handle completing this request.
> */
> - nvme_tcp_error_recovery(ctrl);
> + nvme_tcp_error_recovery(ctrl, false);
> return BLK_EH_RESET_TIMER;
> }
>
> @@ -2596,7 +2609,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
>
> INIT_DELAYED_WORK(&ctrl->connect_work,
> nvme_tcp_reconnect_ctrl_work);
> - INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> + INIT_DELAYED_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
>
> if (!(opts->mask & NVMF_OPT_TRSVCID)) {
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 27a72504d31c..bbeeeeaedc9b 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -157,6 +157,24 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
> req->error_loc = offsetof(struct nvme_rw_command, nsid);
> break;
> case BLK_STS_IOERR:
> + switch (req->cmd->common.opcode) {
> + case nvme_cmd_read:
> + status = NVME_SC_READ_ERROR;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + case nvme_cmd_write:
> + status = NVME_SC_WRITE_FAULT;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + default:
> + status = NVME_SC_DATA_XFER_ERROR | NVME_SC_DNR;
> + req->error_loc = offsetof(struct nvme_common_command,
> + opcode);
> + break;
> + }
> + break;
> default:
> status = NVME_SC_INTERNAL | NVME_SC_DNR;
> req->error_loc = offsetof(struct nvme_common_command, opcode);
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH] nvme-tcp: delay error recovery after link drop
2022-09-06 17:30 ` John Meneghini
@ 2022-09-07 7:57 ` Sagi Grimberg
2022-09-07 8:41 ` Christoph Hellwig
0 siblings, 1 reply; 7+ messages in thread
From: Sagi Grimberg @ 2022-09-07 7:57 UTC (permalink / raw)
To: John Meneghini, Hannes Reinecke, Christoph Hellwig
Cc: Keith Busch, linux-nvme
> I think the problem Hannes is trying to address with this patch is: the
> Spec currently says:
>
> Base 2.0 3.3.2.4
>
> If an NVMe Transport connection is lost as a result of an NVMe Transport
> error, then before performing
> recovery actions related to commands sent on I/O queues associated with
> that NVMe Transport connection,
> the host should wait for at least the longer of:
>
> - the NVMe Keep Alive timeout; or
> - the underlying fabric transport timeout, if any.
>
> I'm not sure the NVMe/TCP host stack obeys this rule.
>
> The problem is, when the host fails to follow this rule, some NVMe/TCP
> controllers are found to corrupt data during cable pull tests.
The delay needs to come explicitly from the controller, not guessed
blindly by the host.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-tcp: delay error recovery after link drop
2022-09-07 7:57 ` Sagi Grimberg
@ 2022-09-07 8:41 ` Christoph Hellwig
0 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2022-09-07 8:41 UTC (permalink / raw)
To: Sagi Grimberg
Cc: John Meneghini, Hannes Reinecke, Christoph Hellwig, Keith Busch,
linux-nvme
On Wed, Sep 07, 2022 at 10:57:20AM +0300, Sagi Grimberg wrote:
>> I'm not sure the NVMe/TCP host stack obeys this rule.
>>
>> The problem is, when the host fails to follow this rule, some NVMe/TCP
>> controllers are found to corrupt data during cable pull tests.
>
> The delay needs to come explicitly from the controller, not guessed
> blindly by the host.
No timeout can work properly. What we need is a proper handshake.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2022-09-07 8:57 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-07-14 12:27 [PATCH] nvme-tcp: delay error recovery after link drop Hannes Reinecke
2022-07-14 14:42 ` Sagi Grimberg
2022-07-14 15:15 ` Hannes Reinecke
2022-07-14 16:07 ` Sagi Grimberg
2022-09-06 17:30 ` John Meneghini
2022-09-07 7:57 ` Sagi Grimberg
2022-09-07 8:41 ` Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox