From: Allison Henderson <achender@kernel.org>
To: netdev@vger.kernel.org
Subject: [RFC 06/15] net/rds: new extension header: rdma bytes
Date: Wed, 22 Oct 2025 12:17:06 -0700 [thread overview]
Message-ID: <20251022191715.157755-7-achender@kernel.org> (raw)
In-Reply-To: <20251022191715.157755-1-achender@kernel.org>
From: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Introduce a new extension header type RDSV3_EXTHDR_RDMA_BYTES for
an RDMA initiator to exchange rdma byte counts to its target.
Currently, RDMA operations cannot precisely account how many bytes a
peer just transferred via RDMA, which limits per-connection statistics
and future policy (e.g., monitoring or rate/cgroup accounting of RDMA
traffic).
In this patch we expand rds_message_add_extension to accept multiple
extensions, and add new flag to RDS header: RDS_FLAG_EXTHDR_EXTENSION,
along with a new extension to RDS header: rds_ext_header_rdma_bytes.
Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Guangyu Sun <guangyu.sun@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/ib_send.c | 19 +++++++++++++-
net/rds/message.c | 65 +++++++++++++++++++++++++++++++++++++----------
net/rds/rds.h | 24 +++++++++++++----
net/rds/send.c | 6 ++---
4 files changed, 91 insertions(+), 23 deletions(-)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index e35bbb6ffb68..3c13cd1d96e2 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -578,10 +578,27 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
* used by the peer to release use-once RDMA MRs. */
if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr;
+ struct rds_ext_header_rdma_bytes rdma_bytes_ext_hdr;
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ RDS_EXTHDR_RDMA, &ext_hdr);
+
+ /* prepare the rdma bytes ext header */
+ rdma_bytes_ext_hdr.h_rflags = rm->rdma.op_write ?
+ RDS_FLAG_RDMA_WR_BYTES : RDS_FLAG_RDMA_RD_BYTES;
+ rdma_bytes_ext_hdr.h_rdma_bytes =
+ cpu_to_be32(rm->rdma.op_bytes);
+
+ if (rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA_BYTES,
+ &rdma_bytes_ext_hdr)) {
+ /* rdma bytes ext header was added succesfully,
+ * notify the remote side via flag in header
+ */
+ rm->m_inc.i_hdr.h_flags |=
+ RDS_FLAG_EXTHDR_EXTENSION;
+ }
}
if (rm->m_rdma_cookie) {
rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
diff --git a/net/rds/message.c b/net/rds/message.c
index 199a899a43e9..591a27c9c62f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -44,6 +44,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
};
@@ -191,31 +192,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
hdr->h_sport = sport;
hdr->h_dport = dport;
hdr->h_sequence = cpu_to_be64(seq);
- hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+ /* see rds_find_next_ext_space for reason why we memset the
+ * ext header
+ */
+ memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE);
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
- const void *data, unsigned int len)
+/*
+ * Find the next place we can add an RDS header extension with
+ * specific length. Extension headers are pushed one after the
+ * other. In the following, the number after the colon is the number
+ * of bytes:
+ *
+ * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE
+ *
+ * If the extension headers fill the complete extension header space
+ * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted.
+ */
+static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len,
+ u8 **ext_start)
{
- unsigned int ext_len = sizeof(u8) + len;
- unsigned char *dst;
+ unsigned int ext_len;
+ unsigned int type;
+ int ind = 0;
+
+ while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) {
+ if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) {
+ *ext_start = hdr->h_exthdr + ind;
+ return 0;
+ }
- /* For now, refuse to add more than one extension header */
- if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
- return 0;
+ type = hdr->h_exthdr[ind];
+
+ ext_len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type);
+ if (!ext_len)
+ return -EINVAL;
+
+ /* ind points to a valid ext hdr with known length */
+ ind += 1 + ext_len;
+ }
+
+ /* no room for extension */
+ return -ENOSPC;
+}
+
+/* The ext hdr space is prefilled with zero from the kzalloc() */
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data)
+{
+ unsigned char *dst;
+ unsigned int len;
- if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+ len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ if (!len)
return 0;
- if (ext_len >= RDS_HEADER_EXT_SPACE)
+ if (rds_find_next_ext_space(hdr, len, &dst))
return 0;
- dst = hdr->h_exthdr;
*dst++ = type;
memcpy(dst, data, len);
- dst[len] = RDS_EXTHDR_NONE;
return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);
@@ -272,7 +311,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
- return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr);
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0c3597ca3f48..569a72c2a2a5 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -184,10 +184,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net)
write_pnet(&conn->c_net, net);
}
-#define RDS_FLAG_CONG_BITMAP 0x01
-#define RDS_FLAG_ACK_REQUIRED 0x02
-#define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT 255
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_FLAG_EXTHDR_EXTENSION 0x20
+#define RDS_MAX_ADV_CREDIT 255
/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
* probe to exchange control information before establishing a connection.
@@ -259,6 +260,19 @@ struct rds_ext_header_rdma_dest {
__be32 h_rdma_offset;
};
+/*
+ * This extension header tells the peer about delivered RDMA byte count.
+ */
+#define RDS_EXTHDR_RDMA_BYTES 4
+
+struct rds_ext_header_rdma_bytes {
+ __be32 h_rdma_bytes; /* byte count */
+ u8 h_rflags; /* direction of RDMA, write or read */
+};
+
+#define RDS_FLAG_RDMA_WR_BYTES 0x01
+#define RDS_FLAG_RDMA_RD_BYTES 0x02
+
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
@@ -871,7 +885,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
int rds_message_add_extension(struct rds_header *hdr,
- unsigned int type, const void *data, unsigned int len);
+ unsigned int type, const void *data);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
diff --git a/net/rds/send.c b/net/rds/send.c
index 0ff100dcc7f5..f73facfbe5b0 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1458,12 +1458,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_NPATHS, &npaths,
- sizeof(npaths));
+ RDS_EXTHDR_NPATHS, &npaths);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
- &my_gen_num,
- sizeof(u32));
+ &my_gen_num);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
--
2.43.0
next prev parent reply other threads:[~2025-10-22 19:17 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-22 19:17 [RFC 00/15] net/rds: RDS-TCP bug fix collection Allison Henderson
2025-10-22 19:17 ` [RFC 01/15] net/rds: Add per cp work queue Allison Henderson
2025-10-22 19:17 ` [RFC 02/15] net/rds: Give each connection its own workqueue Allison Henderson
2025-10-22 19:17 ` [RFC 03/15] net/rds: Change return code from rds_send_xmit() when lock is taken Allison Henderson
2025-10-22 19:17 ` [RFC 04/15] net/rds: No shortcut out of RDS_CONN_ERROR Allison Henderson
2025-10-22 19:17 ` [RFC 05/15] net/rds: rds_tcp_accept_one ought to not discard messages Allison Henderson
2025-10-22 19:17 ` Allison Henderson [this message]
2025-10-22 19:17 ` [RFC 07/15] net/rds: Encode cp_index in TCP source port Allison Henderson
2025-10-22 19:17 ` [RFC 08/15] net/rds: rds_tcp_conn_path_shutdown must not discard messages Allison Henderson
2025-10-22 19:17 ` [RFC 09/15] net/rds: Kick-start TCP receiver after accept Allison Henderson
2025-10-22 19:17 ` [RFC 10/15] net/rds: Clear reconnect pending bit Allison Henderson
2025-10-22 19:17 ` [RFC 11/15] net/rds: Use the first lane until RDS_EXTHDR_NPATHS arrives Allison Henderson
2025-10-22 19:17 ` [RFC 12/15] net/rds: Trigger rds_send_ping() more than once Allison Henderson
2025-10-22 19:17 ` [RFC 13/15] net/rds: Delegate fan-out to a background worker Allison Henderson
2025-10-22 19:17 ` [RFC 14/15] net/rds: Use proper peer port number even when not connected Allison Henderson
2025-10-22 19:17 ` [RFC 15/15] net/rds: rds_sendmsg should not discard payload_len Allison Henderson
2025-10-22 22:04 ` [RFC 00/15] net/rds: RDS-TCP bug fix collection Allison Henderson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251022191715.157755-7-achender@kernel.org \
--to=achender@kernel.org \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).