From: Allison Henderson <achender@kernel.org>
To: netdev@vger.kernel.org
Subject: [RFC 06/15] net/rds: new extension header: rdma bytes
Date: Wed, 22 Oct 2025 12:17:06 -0700 [thread overview]
Message-ID: <20251022191715.157755-7-achender@kernel.org> (raw)
In-Reply-To: <20251022191715.157755-1-achender@kernel.org>
From: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Introduce a new extension header type RDSV3_EXTHDR_RDMA_BYTES for
an RDMA initiator to report RDMA byte counts to its target.
Currently, RDMA operations cannot precisely account for how many bytes a
peer just transferred via RDMA, which limits per-connection statistics
and future policy (e.g., monitoring or rate/cgroup accounting of RDMA
traffic).
In this patch we expand rds_message_add_extension to accept multiple
extensions, and add a new flag to the RDS header: RDS_FLAG_EXTHDR_EXTENSION,
along with a new RDS header extension: rds_ext_header_rdma_bytes.
Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Guangyu Sun <guangyu.sun@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/ib_send.c | 19 +++++++++++++-
net/rds/message.c | 65 +++++++++++++++++++++++++++++++++++++----------
net/rds/rds.h | 24 +++++++++++++----
net/rds/send.c | 6 ++---
4 files changed, 91 insertions(+), 23 deletions(-)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index e35bbb6ffb68..3c13cd1d96e2 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -578,10 +578,27 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
* used by the peer to release use-once RDMA MRs. */
if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr;
+ struct rds_ext_header_rdma_bytes rdma_bytes_ext_hdr;
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ RDS_EXTHDR_RDMA, &ext_hdr);
+
+ /* prepare the rdma bytes ext header */
+ rdma_bytes_ext_hdr.h_rflags = rm->rdma.op_write ?
+ RDS_FLAG_RDMA_WR_BYTES : RDS_FLAG_RDMA_RD_BYTES;
+ rdma_bytes_ext_hdr.h_rdma_bytes =
+ cpu_to_be32(rm->rdma.op_bytes);
+
+ if (rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA_BYTES,
+ &rdma_bytes_ext_hdr)) {
+ /* rdma bytes ext header was added successfully,
+ * notify the remote side via flag in header
+ */
+ rm->m_inc.i_hdr.h_flags |=
+ RDS_FLAG_EXTHDR_EXTENSION;
+ }
}
if (rm->m_rdma_cookie) {
rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
diff --git a/net/rds/message.c b/net/rds/message.c
index 199a899a43e9..591a27c9c62f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -44,6 +44,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
};
@@ -191,31 +192,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
hdr->h_sport = sport;
hdr->h_dport = dport;
hdr->h_sequence = cpu_to_be64(seq);
- hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+ /* see rds_find_next_ext_space for reason why we memset the
+ * ext header
+ */
+ memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE);
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
- const void *data, unsigned int len)
+/*
+ * Find the next place we can add an RDS header extension with
+ * specific length. Extension headers are pushed one after the
+ * other. In the following, the number after the colon is the number
+ * of bytes:
+ *
+ * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE
+ *
+ * If the extension headers fill the complete extension header space
+ * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted.
+ */
+static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len,
+ u8 **ext_start)
{
- unsigned int ext_len = sizeof(u8) + len;
- unsigned char *dst;
+ unsigned int ext_len;
+ unsigned int type;
+ int ind = 0;
+
+ while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) {
+ if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) {
+ *ext_start = hdr->h_exthdr + ind;
+ return 0;
+ }
- /* For now, refuse to add more than one extension header */
- if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
- return 0;
+ type = hdr->h_exthdr[ind];
+
+ ext_len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type);
+ if (!ext_len)
+ return -EINVAL;
+
+ /* ind points to a valid ext hdr with known length */
+ ind += 1 + ext_len;
+ }
+
+ /* no room for extension */
+ return -ENOSPC;
+}
+
+/* The ext hdr space is prefilled with zero from the kzalloc() */
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data)
+{
+ unsigned char *dst;
+ unsigned int len;
- if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+ len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ if (!len)
return 0;
- if (ext_len >= RDS_HEADER_EXT_SPACE)
+ if (rds_find_next_ext_space(hdr, len, &dst))
return 0;
- dst = hdr->h_exthdr;
*dst++ = type;
memcpy(dst, data, len);
- dst[len] = RDS_EXTHDR_NONE;
return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);
@@ -272,7 +311,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
- return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr);
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0c3597ca3f48..569a72c2a2a5 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -184,10 +184,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net)
write_pnet(&conn->c_net, net);
}
-#define RDS_FLAG_CONG_BITMAP 0x01
-#define RDS_FLAG_ACK_REQUIRED 0x02
-#define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT 255
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_FLAG_EXTHDR_EXTENSION 0x20
+#define RDS_MAX_ADV_CREDIT 255
/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
* probe to exchange control information before establishing a connection.
@@ -259,6 +260,19 @@ struct rds_ext_header_rdma_dest {
__be32 h_rdma_offset;
};
+/*
+ * This extension header tells the peer about delivered RDMA byte count.
+ */
+#define RDS_EXTHDR_RDMA_BYTES 4
+
+struct rds_ext_header_rdma_bytes {
+ __be32 h_rdma_bytes; /* byte count */
+ u8 h_rflags; /* direction of RDMA, write or read */
+};
+
+#define RDS_FLAG_RDMA_WR_BYTES 0x01
+#define RDS_FLAG_RDMA_RD_BYTES 0x02
+
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
@@ -871,7 +885,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
int rds_message_add_extension(struct rds_header *hdr,
- unsigned int type, const void *data, unsigned int len);
+ unsigned int type, const void *data);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
diff --git a/net/rds/send.c b/net/rds/send.c
index 0ff100dcc7f5..f73facfbe5b0 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1458,12 +1458,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_NPATHS, &npaths,
- sizeof(npaths));
+ RDS_EXTHDR_NPATHS, &npaths);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
- &my_gen_num,
- sizeof(u32));
+ &my_gen_num);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
--
2.43.0
next prev parent reply other threads:[~2025-10-22 19:17 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-22 19:17 [RFC 00/15] net/rds: RDS-TCP bug fix collection Allison Henderson
2025-10-22 19:17 ` [RFC 01/15] net/rds: Add per cp work queue Allison Henderson
2025-10-22 19:17 ` [RFC 02/15] net/rds: Give each connection its own workqueue Allison Henderson
2025-10-26 0:03 ` kernel test robot
2025-10-22 19:17 ` [RFC 03/15] net/rds: Change return code from rds_send_xmit() when lock is taken Allison Henderson
2025-10-22 19:17 ` [RFC 04/15] net/rds: No shortcut out of RDS_CONN_ERROR Allison Henderson
2025-10-22 19:17 ` [RFC 05/15] net/rds: rds_tcp_accept_one ought to not discard messages Allison Henderson
2025-10-22 19:17 ` Allison Henderson [this message]
2025-10-22 19:17 ` [RFC 07/15] net/rds: Encode cp_index in TCP source port Allison Henderson
2025-10-22 19:17 ` [RFC 08/15] net/rds: rds_tcp_conn_path_shutdown must not discard messages Allison Henderson
2025-10-22 19:17 ` [RFC 09/15] net/rds: Kick-start TCP receiver after accept Allison Henderson
2025-10-22 19:17 ` [RFC 10/15] net/rds: Clear reconnect pending bit Allison Henderson
2025-10-22 19:17 ` [RFC 11/15] net/rds: Use the first lane until RDS_EXTHDR_NPATHS arrives Allison Henderson
2025-10-22 19:17 ` [RFC 12/15] net/rds: Trigger rds_send_ping() more than once Allison Henderson
2025-10-22 19:17 ` [RFC 13/15] net/rds: Delegate fan-out to a background worker Allison Henderson
2025-10-22 19:17 ` [RFC 14/15] net/rds: Use proper peer port number even when not connected Allison Henderson
2025-10-22 19:17 ` [RFC 15/15] net/rds: rds_sendmsg should not discard payload_len Allison Henderson
2025-10-22 22:04 ` [RFC 00/15] net/rds: RDS-TCP bug fix collection Allison Henderson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251022191715.157755-7-achender@kernel.org \
--to=achender@kernel.org \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.