From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>,
linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
Christoph Hellwig <hch@lst.de>, Keith Busch <kbusch@kernel.org>
Cc: Yi Zhang <yi.zhang@redhat.com>, Sagi Grimberg <sagi@grimberg.me>,
Chao Leng <lengchao@huawei.com>, Ming Lei <ming.lei@redhat.com>
Subject: [PATCH 4/4] nvme: tcp: complete non-IO requests atomically
Date: Fri, 16 Oct 2020 22:28:11 +0800 [thread overview]
Message-ID: <20201016142811.1262214-5-ming.lei@redhat.com> (raw)
In-Reply-To: <20201016142811.1262214-1-ming.lei@redhat.com>
While the controller is in the CONNECTING state, admin/fabrics/connect
requests are submitted for recovery, and we allow such a request to be
aborted directly in the timeout handler so that the setup step is not
blocked. As a result, a timeout vs. normal completion race can be
triggered on these requests.

Add atomic completion for requests from the connect, fabrics and admin
queues to avoid this race.
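For context, a minimal sketch of the pattern introduced below
(illustrative only, not part of the patch): both completion paths must
first win a test_and_set_bit() on a per-request flag before they are
allowed to complete the request, so at most one path ever does.

	/* normal completion path, e.g. when a CQE arrives */
	if (nvme_tcp_mark_rq_complete(rq) &&
	    !nvme_try_complete_req(rq, status, res))
		nvme_complete_rq(rq);

	/* timeout path: only completes if it set the bit first */
	if (nvme_tcp_mark_rq_complete(rq) && !blk_mq_request_completed(rq)) {
		nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
		blk_mq_complete_request(rq);
	}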
CC: Chao Leng <lengchao@huawei.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/nvme/host/tcp.c | 40 +++++++++++++++++++++++++++++++++++++---
1 file changed, 37 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 56ac61a90c1b..654061abdc5a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -30,6 +30,8 @@ static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+#define REQ_STATE_COMPLETE 0
+
enum nvme_tcp_send_state {
NVME_TCP_SEND_CMD_PDU = 0,
NVME_TCP_SEND_H2C_PDU,
@@ -56,6 +58,8 @@ struct nvme_tcp_request {
size_t offset;
size_t data_sent;
enum nvme_tcp_send_state state;
+
+ unsigned long comp_state;
};
enum nvme_tcp_queue_flags {
@@ -469,6 +473,33 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
}
+/*
+ * requests originated from admin, fabrics and connect_q have to be
+ * completed atomically because we don't cover the race between timeout
+ * and normal completion for these queues.
+ */
+static inline bool nvme_tcp_need_atomic_complete(struct request *rq)
+{
+ return !rq->rq_disk;
+}
+
+static inline void nvme_tcp_clear_rq_complete(struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+ if (unlikely(nvme_tcp_need_atomic_complete(rq)))
+ clear_bit(REQ_STATE_COMPLETE, &req->comp_state);
+}
+
+static inline bool nvme_tcp_mark_rq_complete(struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+ if (unlikely(nvme_tcp_need_atomic_complete(rq)))
+ return !test_and_set_bit(REQ_STATE_COMPLETE, &req->comp_state);
+ return true;
+}
+
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
struct nvme_completion *cqe)
{
@@ -483,7 +514,8 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
return -EINVAL;
}
- if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
+ if (nvme_tcp_mark_rq_complete(rq) &&
+ !nvme_try_complete_req(rq, cqe->status, cqe->result))
nvme_complete_rq(rq);
queue->nr_cqe++;
@@ -674,7 +706,8 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{
union nvme_result res = {};
- if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
+ if (nvme_tcp_mark_rq_complete(rq) &&
+ !nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
nvme_complete_rq(rq);
}
@@ -2173,7 +2206,7 @@ static void nvme_tcp_complete_timed_out(struct request *rq)
/* fence other contexts that may complete the command */
mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
- if (!blk_mq_request_completed(rq)) {
+ if (nvme_tcp_mark_rq_complete(rq) && !blk_mq_request_completed(rq)) {
nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
blk_mq_complete_request(rq);
}
@@ -2315,6 +2348,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(ret))
return ret;
+ nvme_tcp_clear_rq_complete(rq);
blk_mq_start_request(rq);
nvme_tcp_queue_request(req, true, bd->last);
--
2.25.2