Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Hannes Reinecke <hare@kernel.org>
To: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>, Keith Busch <kbusch@kernel.org>,
	linux-nvme@lists.infradead.org, Hannes Reinecke <hare@kernel.org>
Subject: [PATCH 1/3] nvme-tcp: improve rx/tx fairness
Date: Mon,  8 Jul 2024 09:10:11 +0200	[thread overview]
Message-ID: <20240708071013.69984-2-hare@kernel.org> (raw)
In-Reply-To: <20240708071013.69984-1-hare@kernel.org>

We need to restrict both sides, rx and tx, to only run for a certain time
to ensure that we're not blocking the other side and inducing starvation.
So pass in a 'deadline' value to nvme_tcp_send_all() and nvme_tcp_try_recv()
and break out of the loop if the deadline is reached.

As we now have a timestamp we can also use it to print out a debug
message (for queue ids > 0) if the time spent overruns the rx deadline
by more than 10 msecs.

Performance comparison:
               baseline rx/tx fairness
4k seq write:  449MiB/s 480MiB/s
4k rand write: 410MiB/s 481MiB/s
4k seq read:   478MiB/s 481MiB/s
4k rand read:  547MiB/s 480MiB/s

Random read is ever so disappointing, but that will be fixed with the later
patches.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0873b3949355..f621d3ba89b2 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -153,6 +153,7 @@ struct nvme_tcp_queue {
 	size_t			data_remaining;
 	size_t			ddgst_remaining;
 	unsigned int		nr_cqe;
+	unsigned long		deadline;
 
 	/* send state */
 	struct nvme_tcp_request *request;
@@ -359,14 +360,18 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 	}
 }
 
-static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
+static inline int nvme_tcp_send_all(struct nvme_tcp_queue *queue,
+				    unsigned long deadline)
 {
 	int ret;
 
 	/* drain the send queue as much as we can... */
 	do {
 		ret = nvme_tcp_try_send(queue);
+		if (time_after(jiffies, deadline))
+			break;
 	} while (ret > 0);
+	return ret;
 }
 
 static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
@@ -385,6 +390,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 		bool sync, bool last)
 {
 	struct nvme_tcp_queue *queue = req->queue;
+	unsigned long deadline = jiffies + msecs_to_jiffies(1);
 	bool empty;
 
 	empty = llist_add(&req->lentry, &queue->req_list) &&
@@ -397,7 +403,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 	 */
 	if (queue->io_cpu == raw_smp_processor_id() &&
 	    sync && empty && mutex_trylock(&queue->send_mutex)) {
-		nvme_tcp_send_all(queue);
+		nvme_tcp_send_all(queue, deadline);
 		mutex_unlock(&queue->send_mutex);
 	}
 
@@ -959,9 +965,14 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 			return result;
 		}
+		if (time_after(jiffies, queue->deadline)) {
+			desc->count = 0;
+			break;
+		}
+
 	}
 
-	return consumed;
+	return consumed - len;
 }
 
 static void nvme_tcp_data_ready(struct sock *sk)
@@ -1258,7 +1269,7 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
 	return ret;
 }
 
-static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue, unsigned long deadline)
 {
 	struct socket *sock = queue->sock;
 	struct sock *sk = sock->sk;
@@ -1269,6 +1280,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
 	rd_desc.count = 1;
 	lock_sock(sk);
 	queue->nr_cqe = 0;
+	queue->deadline = deadline;
 	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
 	release_sock(sk);
 	return consumed;
@@ -1278,14 +1290,15 @@ static void nvme_tcp_io_work(struct work_struct *w)
 {
 	struct nvme_tcp_queue *queue =
 		container_of(w, struct nvme_tcp_queue, io_work);
-	unsigned long deadline = jiffies + msecs_to_jiffies(1);
+	unsigned long tx_deadline = jiffies + msecs_to_jiffies(1);
+	unsigned long rx_deadline = tx_deadline + msecs_to_jiffies(1), overrun;
 
 	do {
 		bool pending = false;
 		int result;
 
 		if (mutex_trylock(&queue->send_mutex)) {
-			result = nvme_tcp_try_send(queue);
+			result = nvme_tcp_send_all(queue, tx_deadline);
 			mutex_unlock(&queue->send_mutex);
 			if (result > 0)
 				pending = true;
@@ -1293,7 +1306,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
 				break;
 		}
 
-		result = nvme_tcp_try_recv(queue);
+		result = nvme_tcp_try_recv(queue, rx_deadline);
 		if (result > 0)
 			pending = true;
 		else if (unlikely(result < 0))
@@ -1302,7 +1315,13 @@ static void nvme_tcp_io_work(struct work_struct *w)
 		if (!pending || !queue->rd_enabled)
 			return;
 
-	} while (!time_after(jiffies, deadline)); /* quota is exhausted */
+	} while (!time_after(jiffies, rx_deadline)); /* quota is exhausted */
+
+	overrun = jiffies - rx_deadline;
+	if (nvme_tcp_queue_id(queue) > 0 &&
+	    overrun > msecs_to_jiffies(10))
+		dev_dbg(queue->ctrl->ctrl.device, "queue %d: queue stall (%u msecs)\n",
+			nvme_tcp_queue_id(queue), jiffies_to_msecs(overrun));
 
 	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 }
@@ -2666,6 +2685,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct nvme_tcp_queue *queue = hctx->driver_data;
 	struct sock *sk = queue->sock->sk;
+	unsigned long deadline = jiffies + msecs_to_jiffies(1);
 
 	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
 		return 0;
@@ -2673,7 +2693,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
 	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
 		sk_busy_loop(sk, true);
-	nvme_tcp_try_recv(queue);
+	nvme_tcp_try_recv(queue, deadline);
 	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
 	return queue->nr_cqe;
 }
-- 
2.35.3



  reply	other threads:[~2024-07-08  7:10 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-07-08  7:10 [PATCHv2 0/3] nvme-tcp: improve scalability Hannes Reinecke
2024-07-08  7:10 ` Hannes Reinecke [this message]
2024-07-08 11:57   ` [PATCH 1/3] nvme-tcp: improve rx/tx fairness Sagi Grimberg
2024-07-08 13:21     ` Hannes Reinecke
2024-07-08 14:25       ` Sagi Grimberg
2024-07-08 15:50         ` Hannes Reinecke
2024-07-08 19:31           ` Sagi Grimberg
2024-07-09  6:51             ` Hannes Reinecke
2024-07-09  7:06               ` Sagi Grimberg
2024-07-08  7:10 ` [PATCH 2/3] nvme-tcp: align I/O cpu with blk-mq mapping Hannes Reinecke
2024-07-08 12:08   ` Sagi Grimberg
2024-07-08 12:43     ` Hannes Reinecke
2024-07-08 14:38       ` Sagi Grimberg
2024-07-08  7:10 ` [PATCH 3/3] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
2024-07-08 12:12   ` Sagi Grimberg
2024-07-08 12:48     ` Hannes Reinecke
2024-07-08 14:41       ` Sagi Grimberg
2024-07-10 11:56 ` [PATCHv2 0/3] nvme-tcp: improve scalability Sagi Grimberg
2024-07-10 14:06   ` Hannes Reinecke
2024-07-10 14:45     ` Sagi Grimberg
2024-07-16  6:31 ` Sagi Grimberg
2024-07-16  7:10   ` Hannes Reinecke

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240708071013.69984-2-hare@kernel.org \
    --to=hare@kernel.org \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox