From: Allison Henderson <achender@kernel.org>
To: netdev@vger.kernel.org
Cc: linux-kselftest@vger.kernel.org, pabeni@redhat.com,
edumazet@google.com, rds-devel@oss.oracle.com, kuba@kernel.org,
horms@kernel.org, linux-rdma@vger.kernel.org,
allison.henderson@oracle.com
Subject: [PATCH net-next v4 2/8] net/rds: Encode cp_index in TCP source port
Date: Fri, 30 Jan 2026 18:25:01 -0700 [thread overview]
Message-ID: <20260131012507.814119-3-achender@kernel.org> (raw)
In-Reply-To: <20260131012507.814119-1-achender@kernel.org>
From: Gerd Rausch <gerd.rausch@oracle.com>
Upon "sendmsg", RDS/TCP selects a backend connection based
on a hash calculated from the source-port ("RDS_MPATH_HASH").
However, "rds_tcp_accept_one" accepts connections
in the order they arrive, which is non-deterministic.
As a result, the mapping of the sender's "cp->cp_index"
to that of the receiver may change whenever the backend
connections are dropped and reconnected.
This is a problem because connection state that is preserved
across reconnects (e.g. "cp_next_rx_seq") relies on that
sender<->receiver mapping never changing.
So we make sure that client and server of the TCP connection
have the exact same "cp->cp_index" across reconnects by
encoding "cp->cp_index" in the lower three bits of the
client's TCP source port.
A new extension "RDS_EXTHDR_SPORT_IDX" is introduced
that allows the server to distinguish
clients that do the "cp->cp_index" encoding from
legacy clients that pick source ports randomly.
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
net/rds/message.c | 1 +
net/rds/rds.h | 3 +++
net/rds/recv.c | 7 +++++++
net/rds/send.c | 4 ++++
net/rds/tcp.h | 1 +
net/rds/tcp_connect.c | 23 +++++++++++++++++++++-
net/rds/tcp_listen.c | 45 +++++++++++++++++++++++++++++++++++++------
7 files changed, 77 insertions(+), 7 deletions(-)
diff --git a/net/rds/message.c b/net/rds/message.c
index 591a27c9c62f..54fd000806ea 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -47,6 +47,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
+[RDS_EXTHDR_SPORT_IDX] = 1,
};
void rds_message_addref(struct rds_message *rm)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 4b6bf523b412..5b5fb53b1fc5 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -147,6 +147,7 @@ struct rds_connection {
c_ping_triggered:1,
c_pad_to_32:29;
int c_npaths;
+ bool c_with_sport_idx;
struct rds_connection *c_passive;
struct rds_transport *c_trans;
@@ -278,8 +279,10 @@ struct rds_ext_header_rdma_bytes {
*/
#define RDS_EXTHDR_NPATHS 5
#define RDS_EXTHDR_GEN_NUM 6
+#define RDS_EXTHDR_SPORT_IDX 8
#define __RDS_EXTHDR_MAX 16 /* for now */
+
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
#define RDS_MSG_RX_HDR 0
#define RDS_MSG_RX_START 1
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 66680f652e74..ddf128a02347 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -204,7 +204,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
struct rds_ext_header_version version;
__be16 rds_npaths;
__be32 rds_gen_num;
+ u8 dummy;
} buffer;
+ bool new_with_sport_idx = false;
u32 new_peer_gen_num = 0;
while (1) {
@@ -221,11 +223,16 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
case RDS_EXTHDR_GEN_NUM:
new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
break;
+ case RDS_EXTHDR_SPORT_IDX:
+ new_with_sport_idx = true;
+ break;
default:
pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type);
}
}
+
+ conn->c_with_sport_idx = new_with_sport_idx;
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
conn->c_npaths = max_t(int, conn->c_npaths, 1);
conn->c_ping_triggered = 0;
diff --git a/net/rds/send.c b/net/rds/send.c
index 306785fa7065..85e1c5352ad8 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1457,12 +1457,16 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
cp->cp_conn->c_trans->t_mp_capable) {
__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
+ u8 dummy = 0;
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
&my_gen_num);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_SPORT_IDX,
+ &dummy);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 7d07128593b7..7c91974fcde7 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -34,6 +34,7 @@ struct rds_tcp_connection {
*/
struct mutex t_conn_path_lock;
struct socket *t_sock;
+ u32 t_client_port_group;
struct rds_tcp_net *t_rtn;
void *t_orig_write_space;
void *t_orig_data_ready;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 92891b0d224d..4947ee73bad0 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -93,6 +93,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
struct sockaddr_in6 sin6;
struct sockaddr_in sin;
struct sockaddr *addr;
+ int port_low, port_high, port;
+ int port_groups, groups_left;
int addrlen;
bool isv6;
int ret;
@@ -145,7 +147,26 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
addrlen = sizeof(sin);
}
- ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
+ /* encode cp->cp_index in lowest bits of source-port */
+ inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high);
+ port_low = ALIGN(port_low, RDS_MPATH_WORKERS);
+ port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;
+ ret = -EADDRINUSE;
+ groups_left = port_groups;
+ while (groups_left-- > 0 && ret) {
+ if (++tc->t_client_port_group >= port_groups)
+ tc->t_client_port_group = 0;
+ port = port_low +
+ tc->t_client_port_group * RDS_MPATH_WORKERS +
+ cp->cp_index;
+
+ if (isv6)
+ sin6.sin6_port = htons(port);
+ else
+ sin.sin_port = htons(port);
+ ret = kernel_bind(sock, (struct sockaddr_unsized *)addr,
+ addrlen);
+ }
if (ret) {
rdsdebug("bind failed with %d at address %pI6c\n",
ret, &conn->c_laddr);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 551c847f2890..8129ea9da31c 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -62,19 +62,52 @@ void rds_tcp_keepalive(struct socket *sock)
* we special case cp_index 0 is to allow the rds probe ping itself to itself
* get through efficiently.
*/
-static
-struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
+static struct rds_tcp_connection *
+rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
{
- int i;
- int npaths = max_t(int, 1, conn->c_npaths);
+ union {
+ struct sockaddr_storage storage;
+ struct sockaddr addr;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } saddr;
+ int sport, npaths, i_min, i_max, i;
+
+ if (conn->c_with_sport_idx &&
+ kernel_getpeername(sock, &saddr.addr) >= 0) {
+ /* cp->cp_index is encoded in lowest bits of source-port */
+ switch (saddr.addr.sa_family) {
+ case AF_INET:
+ sport = ntohs(saddr.sin.sin_port);
+ break;
+ case AF_INET6:
+ sport = ntohs(saddr.sin6.sin6_port);
+ break;
+ default:
+ sport = -1;
+ }
+ } else {
+ sport = -1;
+ }
+
+ npaths = max_t(int, 1, conn->c_npaths);
- for (i = 0; i < npaths; i++) {
+ if (sport >= 0) {
+ i_min = sport % npaths;
+ i_max = i_min;
+ } else {
+ i_min = 0;
+ i_max = npaths - 1;
+ }
+
+ for (i = i_min; i <= i_max; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
RDS_CONN_CONNECTING))
return cp->cp_transport_data;
}
+
return NULL;
}
@@ -199,7 +232,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
* to and discarded by the sender.
* We must not throw those away!
*/
- rs_tcp = rds_tcp_accept_one_path(conn);
+ rs_tcp = rds_tcp_accept_one_path(conn, new_sock);
if (!rs_tcp) {
/* It's okay to stash "new_sock", since
* "rds_tcp_conn_slots_available" triggers
--
2.43.0
next prev parent reply other threads:[~2026-01-31 1:25 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-31 1:24 [PATCH net-next v4 0/8] net/rds: RDS-TCP protocol and extension improvements Allison Henderson
2026-01-31 1:25 ` [PATCH net-next v4 1/8] net/rds: new extension header: rdma bytes Allison Henderson
2026-01-31 1:25 ` Allison Henderson [this message]
2026-01-31 1:25 ` [PATCH net-next v4 3/8] net/rds: rds_tcp_conn_path_shutdown must not discard messages Allison Henderson
2026-01-31 1:25 ` [PATCH net-next v4 4/8] net/rds: Kick-start TCP receiver after accept Allison Henderson
2026-01-31 1:25 ` [PATCH net-next v4 5/8] net/rds: Clear reconnect pending bit Allison Henderson
2026-01-31 1:25 ` [PATCH net-next v4 6/8] net/rds: Update struct rds_statistics to use u64 instead of uint64_t Allison Henderson
2026-01-31 1:25 ` [PATCH net-next v4 7/8] net/rds: Use the first lane until RDS_EXTHDR_NPATHS arrives Allison Henderson
2026-01-31 1:25 ` [PATCH net-next v4 8/8] net/rds: Trigger rds_send_ping() more than once Allison Henderson
2026-02-03 1:55 ` Jakub Kicinski
2026-02-03 6:41 ` Allison Henderson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260131012507.814119-3-achender@kernel.org \
--to=achender@kernel.org \
--cc=allison.henderson@oracle.com \
--cc=edumazet@google.com \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rds-devel@oss.oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.