From: Eric Dumazet <edumazet@google.com>
To: "David S . Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>,
Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>,
Neal Cardwell <ncardwell@google.com>,
Kuniyuki Iwashima <kuniyu@google.com>,
netdev@vger.kernel.org, eric.dumazet@gmail.com,
Eric Dumazet <edumazet@google.com>
Subject: [PATCH net-next] tcp: reduce tcp sockets size by one cache line
Date: Thu, 29 Jan 2026 15:34:58 +0000 [thread overview]
Message-ID: <20260129153458.4163797-1-edumazet@google.com> (raw)
By default, when a kmem_cache is created with SLAB_TYPESAFE_BY_RCU,
slub has to use extra storage for the freelist pointer after each
object, because slub assumes that any bit in the object
can be used by RCU readers.
Because proto_register() is also using SLAB_HWCACHE_ALIGN,
this forces slub to use one extra cache line per object.
We can instead put the slub freelist pointer anywhere in the object,
provided that concurrent RCU readers are not supposed to
use the pointer value.
Add a new (struct sock)sk_freeptr field, in a union
with sk_rcu: no RCU reader needs to look at sk_rcu,
which is only used during the free phase.
Tested:
grep . /sys/kernel/slab/TCP/{object_size,slab_size,objs_per_slab}
grep . /sys/kernel/slab/TCPv6/{object_size,slab_size,objs_per_slab}
Before:
/sys/kernel/slab/TCP/object_size:2368
/sys/kernel/slab/TCP/slab_size:2432
/sys/kernel/slab/TCP/objs_per_slab:13
/sys/kernel/slab/TCPv6/object_size:2496
/sys/kernel/slab/TCPv6/slab_size:2560
/sys/kernel/slab/TCPv6/objs_per_slab:12
After this patch, we can pack one more TCPv6 object per slab,
and object_size == slab_size.
/sys/kernel/slab/TCP/object_size:2368
/sys/kernel/slab/TCP/slab_size:2368
/sys/kernel/slab/TCP/objs_per_slab:13
/sys/kernel/slab/TCPv6/object_size:2496
/sys/kernel/slab/TCPv6/slab_size:2496
/sys/kernel/slab/TCPv6/objs_per_slab:13
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sock.h | 11 ++++++++++-
net/core/sock.c | 16 ++++++++++------
net/ipv4/tcp_ipv4.c | 2 ++
net/ipv6/tcp_ipv6.c | 2 ++
4 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index aafe8bdb2c0f936bc3a179e394c2df6830419997..66b56288c1d3850439b2a0bed00be801d5770efa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -341,6 +341,7 @@ struct sk_filter;
* @sk_reuseport_cb: reuseport group container
* @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
* @sk_rcu: used during RCU grace period
+ * @sk_freeptr: used for SLAB_TYPESAFE_BY_RCU managed sockets
* @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
@@ -582,7 +583,14 @@ struct sock {
struct bpf_local_storage __rcu *sk_bpf_storage;
#endif
struct numa_drop_counters *sk_drop_counters;
- struct rcu_head sk_rcu;
+ /* sockets using SLAB_TYPESAFE_BY_RCU can use sk_freeptr.
+ * By the time kfree() is called, sk_rcu can not be in
+ * use and can be mangled.
+ */
+ union {
+ struct rcu_head sk_rcu;
+ freeptr_t sk_freeptr;
+ };
netns_tracker ns_tracker;
struct xarray sk_user_frags;
@@ -1368,6 +1376,7 @@ struct proto {
struct kmem_cache *slab;
unsigned int obj_size;
+ unsigned int freeptr_offset;
unsigned int ipv6_pinfo_offset;
slab_flags_t slab_flags;
unsigned int useroffset; /* Usercopy region offset */
diff --git a/net/core/sock.c b/net/core/sock.c
index a1c8b47b0d5662b882dc0f9257c54ed312c383b4..693e6d80f501ef552aa58928f28b78a578169536 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab)
return -EINVAL;
}
if (alloc_slab) {
- prot->slab = kmem_cache_create_usercopy(prot->name,
- prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
- prot->slab_flags,
- prot->useroffset, prot->usersize,
- NULL);
+ struct kmem_cache_args args = {
+ .useroffset = prot->useroffset,
+ .usersize = prot->usersize,
+ .freeptr_offset = prot->freeptr_offset,
+ .use_freeptr_offset = !!prot->freeptr_offset,
+ };
+ prot->slab = kmem_cache_create(prot->name, prot->obj_size,
+ &args,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+ prot->slab_flags);
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ffdf52fbf6463b41d7c712f3710b681ecdf6e2d7..0fc8a42921aabac27dcb7c6a9db811498edbb31c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3460,6 +3460,8 @@ struct proto tcp_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .freeptr_offset = offsetof(struct tcp_sock,
+ inet_conn.icsk_inet.sk.sk_freeptr),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4ae664b05fa9171ed996bf8f3b6e7b2aaa63d5c9..8bf29186c15f99dd2ab63d2b0b3890ed0c68d514 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2332,6 +2332,8 @@ struct proto tcpv6_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
+ .freeptr_offset = offsetof(struct tcp6_sock,
+ tcp.inet_conn.icsk_inet.sk.sk_freeptr),
.ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
--
2.53.0.rc1.217.geba53bf80e-goog
next reply other threads:[~2026-01-29 15:35 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-29 15:34 Eric Dumazet [this message]
2026-01-31 1:40 ` [PATCH net-next] tcp: reduce tcp sockets size by one cache line patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260129153458.4163797-1-edumazet@google.com \
--to=edumazet@google.com \
--cc=davem@davemloft.net \
--cc=eric.dumazet@gmail.com \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox