From: Allison Henderson <achender@kernel.org>
To: netdev@vger.kernel.org
Cc: linux-kselftest@vger.kernel.org, pabeni@redhat.com,
edumazet@google.com, rds-devel@oss.oracle.com, kuba@kernel.org,
horms@kernel.org, linux-rdma@vger.kernel.org,
allison.henderson@oracle.com
Subject: [PATCH net-next v5 1/8] net/rds: new extension header: rdma bytes
Date: Mon, 2 Feb 2026 22:57:16 -0700 [thread overview]
Message-ID: <20260203055723.1085751-2-achender@kernel.org> (raw)
In-Reply-To: <20260203055723.1085751-1-achender@kernel.org>
From: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Introduce a new extension header type RDS_EXTHDR_RDMA_BYTES for
an RDMA initiator to report RDMA byte counts to its target.
Currently, RDMA operations cannot precisely account for how many bytes a
peer just transferred via RDMA, which limits per-connection statistics
and future policy (e.g., monitoring or rate/cgroup accounting of RDMA
traffic).
This patch extends rds_message_add_extension() to accept multiple
extensions, and adds a new RDS header flag, RDS_FLAG_EXTHDR_EXTENSION,
along with a new extension header, struct rds_ext_header_rdma_bytes.
Signed-off-by: Shamir Rabinovitch <shamir.rabinovitch@oracle.com>
Signed-off-by: Guangyu Sun <guangyu.sun@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/ib_send.c | 40 ++++++++++++++++++++++++-----
net/rds/message.c | 65 +++++++++++++++++++++++++++++++++++++----------
net/rds/rds.h | 25 ++++++++++++++----
net/rds/send.c | 6 ++---
4 files changed, 107 insertions(+), 29 deletions(-)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index f9d28ddd168d..fcd04c29f543 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -577,16 +577,42 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
if (rm->rdma.op_active) {
- struct rds_ext_header_rdma ext_hdr;
+ struct rds_ext_header_rdma ext_hdr = {};
+ struct rds_ext_header_rdma_bytes
+ rdma_bytes_ext_hdr = {};
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
- rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ if (rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA,
+ &ext_hdr)) {
+ /* prepare the rdma bytes ext header */
+ rdma_bytes_ext_hdr.h_rflags =
+ rm->rdma.op_write ?
+ RDS_FLAG_RDMA_WR_BYTES :
+ RDS_FLAG_RDMA_RD_BYTES;
+ rdma_bytes_ext_hdr.h_rdma_bytes =
+ cpu_to_be32(rm->rdma.op_bytes);
+ } else {
+ rdsdebug("RDS_EXTHDR_RDMA dropped");
+ }
+
+ if (rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA_BYTES,
+ &rdma_bytes_ext_hdr)) {
+ /* rdma bytes ext header was added successfully,
+ * notify the remote side via flag in header
+ */
+ rm->m_inc.i_hdr.h_flags |=
+ RDS_FLAG_EXTHDR_EXTENSION;
+ } else {
+ rdsdebug("RDS_EXTHDR_RDMA_BYTES dropped");
+ }
}
- if (rm->m_rdma_cookie) {
- rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
- rds_rdma_cookie_key(rm->m_rdma_cookie),
- rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ if (rm->m_rdma_cookie &&
+ !rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie))) {
+ rdsdebug("RDS_EXTHDR_RDMA_DEST dropped\n");
}
/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
diff --git a/net/rds/message.c b/net/rds/message.c
index 199a899a43e9..591a27c9c62f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -44,6 +44,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
};
@@ -191,31 +192,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
hdr->h_sport = sport;
hdr->h_dport = dport;
hdr->h_sequence = cpu_to_be64(seq);
- hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+ /* see rds_find_next_ext_space for reason why we memset the
+ * ext header
+ */
+ memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE);
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
- const void *data, unsigned int len)
+/*
+ * Find the next place we can add an RDS header extension with
+ * specific length. Extension headers are pushed one after the
+ * other. In the following, the number after the colon is the number
+ * of bytes:
+ *
+ * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE
+ *
+ * If the extension headers fill the complete extension header space
+ * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted.
+ */
+static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len,
+ u8 **ext_start)
{
- unsigned int ext_len = sizeof(u8) + len;
- unsigned char *dst;
+ unsigned int ext_len;
+ unsigned int type;
+ int ind = 0;
+
+ while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) {
+ if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) {
+ *ext_start = hdr->h_exthdr + ind;
+ return 0;
+ }
- /* For now, refuse to add more than one extension header */
- if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
- return 0;
+ type = hdr->h_exthdr[ind];
+
+ ext_len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type);
+ if (!ext_len)
+ return -EINVAL;
+
+ /* ind points to a valid ext hdr with known length */
+ ind += 1 + ext_len;
+ }
+
+ /* no room for extension */
+ return -ENOSPC;
+}
+
+/* The ext hdr space is prefilled with zero from the kzalloc() */
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data)
+{
+ unsigned char *dst;
+ unsigned int len;
- if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+ len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ if (!len)
return 0;
- if (ext_len >= RDS_HEADER_EXT_SPACE)
+ if (rds_find_next_ext_space(hdr, len, &dst))
return 0;
- dst = hdr->h_exthdr;
*dst++ = type;
memcpy(dst, data, len);
- dst[len] = RDS_EXTHDR_NONE;
return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);
@@ -272,7 +311,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
- return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr);
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 8a549fe687ac..4b6bf523b412 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -183,10 +183,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net)
write_pnet(&conn->c_net, net);
}
-#define RDS_FLAG_CONG_BITMAP 0x01
-#define RDS_FLAG_ACK_REQUIRED 0x02
-#define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT 255
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_FLAG_EXTHDR_EXTENSION 0x20
+#define RDS_MAX_ADV_CREDIT 255
/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
* probe to exchange control information before establishing a connection.
@@ -258,6 +259,20 @@ struct rds_ext_header_rdma_dest {
__be32 h_rdma_offset;
};
+/*
+ * This extension header tells the peer about delivered RDMA byte count.
+ */
+#define RDS_EXTHDR_RDMA_BYTES 4
+
+struct rds_ext_header_rdma_bytes {
+ __be32 h_rdma_bytes; /* byte count */
+ u8 h_rflags; /* direction of RDMA, write or read */
+ u8 h_pad[3];
+};
+
+#define RDS_FLAG_RDMA_WR_BYTES 0x01
+#define RDS_FLAG_RDMA_RD_BYTES 0x02
+
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
@@ -871,7 +886,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
int rds_message_add_extension(struct rds_header *hdr,
- unsigned int type, const void *data, unsigned int len);
+ unsigned int type, const void *data);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
diff --git a/net/rds/send.c b/net/rds/send.c
index 3e3d028bc21e..306785fa7065 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1459,12 +1459,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_NPATHS, &npaths,
- sizeof(npaths));
+ RDS_EXTHDR_NPATHS, &npaths);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
- &my_gen_num,
- sizeof(u32));
+ &my_gen_num);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
--
2.43.0
next prev parent reply other threads:[~2026-02-03 5:57 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-03 5:57 [PATCH net-next v5 0/8] net/rds: RDS-TCP protocol and extension improvements Allison Henderson
2026-02-03 5:57 ` Allison Henderson [this message]
2026-02-03 5:57 ` [PATCH net-next v5 2/8] net/rds: Encode cp_index in TCP source port Allison Henderson
2026-02-03 5:57 ` [PATCH net-next v5 3/8] net/rds: rds_tcp_conn_path_shutdown must not discard messages Allison Henderson
2026-02-03 5:57 ` [PATCH net-next v5 4/8] net/rds: Kick-start TCP receiver after accept Allison Henderson
2026-02-03 5:57 ` [PATCH net-next v5 5/8] net/rds: Clear reconnect pending bit Allison Henderson
2026-02-03 5:57 ` [PATCH net-next v5 6/8] net/rds: Update struct rds_statistics to use u64 instead of uint64_t Allison Henderson
2026-02-03 5:57 ` [PATCH net-next v5 7/8] net/rds: Use the first lane until RDS_EXTHDR_NPATHS arrives Allison Henderson
2026-02-03 5:57 ` [PATCH net-next v5 8/8] net/rds: Trigger rds_send_ping() more than once Allison Henderson
2026-02-05 5:00 ` [PATCH net-next v5 0/8] net/rds: RDS-TCP protocol and extension improvements patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260203055723.1085751-2-achender@kernel.org \
--to=achender@kernel.org \
--cc=allison.henderson@oracle.com \
--cc=edumazet@google.com \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rds-devel@oss.oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.