From: Hannes Reinecke <hare@kernel.org>
To: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>, Keith Busch <kbusch@kernel.org>,
linux-nvme@lists.infradead.org, Hannes Reinecke <hare@kernel.org>
Subject: [PATCH 1/3] nvme-tcp: improve rx/tx fairness
Date: Mon, 8 Jul 2024 09:10:11 +0200 [thread overview]
Message-ID: <20240708071013.69984-2-hare@kernel.org> (raw)
In-Reply-To: <20240708071013.69984-1-hare@kernel.org>
We need to restrict both sides, rx and tx, to only run for a certain time
to ensure that we're not blocking the other side and inducing starvation.
So pass in a 'deadline' value to nvme_tcp_send_all() and nvme_tcp_try_recv()
and break out of the loop if the deadline is reached.
As we now have a timestamp we can also use it to print out a warning
if the actual time spent exceeds the deadline.
Performance comparison:
               baseline   rx/tx fairness
4k seq write:  449MiB/s   480MiB/s
4k rand write: 410MiB/s   481MiB/s
4k seq read:   478MiB/s   481MiB/s
4k rand read:  547MiB/s   480MiB/s
Random read regresses with this patch alone (547MiB/s -> 480MiB/s), but
that will be fixed by the later patches in this series.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/tcp.c | 38 +++++++++++++++++++++++++++++---------
1 file changed, 29 insertions(+), 9 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0873b3949355..f621d3ba89b2 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -153,6 +153,7 @@ struct nvme_tcp_queue {
size_t data_remaining;
size_t ddgst_remaining;
unsigned int nr_cqe;
+ unsigned long deadline;
/* send state */
struct nvme_tcp_request *request;
@@ -359,14 +360,18 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
}
}
-static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
+static inline int nvme_tcp_send_all(struct nvme_tcp_queue *queue,
+ unsigned long deadline)
{
int ret;
/* drain the send queue as much as we can... */
do {
ret = nvme_tcp_try_send(queue);
+ if (time_after(jiffies, deadline))
+ break;
} while (ret > 0);
+ return ret;
}
static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
@@ -385,6 +390,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
bool sync, bool last)
{
struct nvme_tcp_queue *queue = req->queue;
+ unsigned long deadline = jiffies + msecs_to_jiffies(1);
bool empty;
empty = llist_add(&req->lentry, &queue->req_list) &&
@@ -397,7 +403,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
*/
if (queue->io_cpu == raw_smp_processor_id() &&
sync && empty && mutex_trylock(&queue->send_mutex)) {
- nvme_tcp_send_all(queue);
+ nvme_tcp_send_all(queue, deadline);
mutex_unlock(&queue->send_mutex);
}
@@ -959,9 +965,14 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return result;
}
+ if (time_after(jiffies, queue->deadline)) {
+ desc->count = 0;
+ break;
+ }
+
}
- return consumed;
+ return consumed - len;
}
static void nvme_tcp_data_ready(struct sock *sk)
@@ -1258,7 +1269,7 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
return ret;
}
-static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue, unsigned long deadline)
{
struct socket *sock = queue->sock;
struct sock *sk = sock->sk;
@@ -1269,6 +1280,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
rd_desc.count = 1;
lock_sock(sk);
queue->nr_cqe = 0;
+ queue->deadline = deadline;
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
return consumed;
@@ -1278,14 +1290,15 @@ static void nvme_tcp_io_work(struct work_struct *w)
{
struct nvme_tcp_queue *queue =
container_of(w, struct nvme_tcp_queue, io_work);
- unsigned long deadline = jiffies + msecs_to_jiffies(1);
+ unsigned long tx_deadline = jiffies + msecs_to_jiffies(1);
+ unsigned long rx_deadline = tx_deadline + msecs_to_jiffies(1), overrun;
do {
bool pending = false;
int result;
if (mutex_trylock(&queue->send_mutex)) {
- result = nvme_tcp_try_send(queue);
+ result = nvme_tcp_send_all(queue, tx_deadline);
mutex_unlock(&queue->send_mutex);
if (result > 0)
pending = true;
@@ -1293,7 +1306,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
break;
}
- result = nvme_tcp_try_recv(queue);
+ result = nvme_tcp_try_recv(queue, rx_deadline);
if (result > 0)
pending = true;
else if (unlikely(result < 0))
@@ -1302,7 +1315,13 @@ static void nvme_tcp_io_work(struct work_struct *w)
if (!pending || !queue->rd_enabled)
return;
- } while (!time_after(jiffies, deadline)); /* quota is exhausted */
+ } while (!time_after(jiffies, rx_deadline)); /* quota is exhausted */
+
+ overrun = jiffies - rx_deadline;
+ if (nvme_tcp_queue_id(queue) > 0 &&
+ overrun > msecs_to_jiffies(10))
+ dev_dbg(queue->ctrl->ctrl.device, "queue %d: queue stall (%u msecs)\n",
+ nvme_tcp_queue_id(queue), jiffies_to_msecs(overrun));
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
@@ -2666,6 +2685,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
+ unsigned long deadline = jiffies + msecs_to_jiffies(1);
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
@@ -2673,7 +2693,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
- nvme_tcp_try_recv(queue);
+ nvme_tcp_try_recv(queue, deadline);
clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return queue->nr_cqe;
}
--
2.35.3
next prev parent reply other threads:[~2024-07-08 7:10 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-08 7:10 [PATCHv2 0/3] nvme-tcp: improve scalability Hannes Reinecke
2024-07-08 7:10 ` Hannes Reinecke [this message]
2024-07-08 11:57 ` [PATCH 1/3] nvme-tcp: improve rx/tx fairness Sagi Grimberg
2024-07-08 13:21 ` Hannes Reinecke
2024-07-08 14:25 ` Sagi Grimberg
2024-07-08 15:50 ` Hannes Reinecke
2024-07-08 19:31 ` Sagi Grimberg
2024-07-09 6:51 ` Hannes Reinecke
2024-07-09 7:06 ` Sagi Grimberg
2024-07-08 7:10 ` [PATCH 2/3] nvme-tcp: align I/O cpu with blk-mq mapping Hannes Reinecke
2024-07-08 12:08 ` Sagi Grimberg
2024-07-08 12:43 ` Hannes Reinecke
2024-07-08 14:38 ` Sagi Grimberg
2024-07-08 7:10 ` [PATCH 3/3] nvme-tcp: per-controller I/O workqueues Hannes Reinecke
2024-07-08 12:12 ` Sagi Grimberg
2024-07-08 12:48 ` Hannes Reinecke
2024-07-08 14:41 ` Sagi Grimberg
2024-07-10 11:56 ` [PATCHv2 0/3] nvme-tcp: improve scalability Sagi Grimberg
2024-07-10 14:06 ` Hannes Reinecke
2024-07-10 14:45 ` Sagi Grimberg
2024-07-16 6:31 ` Sagi Grimberg
2024-07-16 7:10 ` Hannes Reinecke
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240708071013.69984-2-hare@kernel.org \
--to=hare@kernel.org \
--cc=hch@lst.de \
--cc=kbusch@kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=sagi@grimberg.me \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.