* [PATCH 4.14] tcp/udp: Make early_demux back namespacified.
@ 2022-11-03 23:50 Kuniyuki Iwashima
2022-11-07 15:17 ` Greg KH
0 siblings, 1 reply; 2+ messages in thread
From: Kuniyuki Iwashima @ 2022-11-03 23:50 UTC (permalink / raw)
To: stable; +Cc: Kuniyuki Iwashima, Jakub Kicinski
commit 11052589cf5c0bab3b4884d423d5f60c38fcf25d upstream
Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made
it possible to enable/disable early_demux on a per-netns basis. Then, we
introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for
TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for
tcp and udp"). However, the .proc_handler() was wrong and actually
disabled us from changing the behaviour in each netns.
We can execute early_demux if net.ipv4.ip_early_demux is on and each proto
.early_demux() handler is not NULL. When we toggle (tcp|udp)_early_demux,
the change itself is saved in each netns variable, but the .early_demux()
handler is a global variable, so the handler is switched based on the
init_net's sysctl variable. Thus, netns (tcp|udp)_early_demux knobs have
nothing to do with the logic. Whether we CAN execute proto .early_demux()
is always decided by init_net's sysctl knob, and whether we DO it or not is
by each netns ip_early_demux knob.
This patch namespacifies (tcp|udp)_early_demux again. For now, the users
of the .early_demux() handler are TCP and UDP only, and they are called
directly to avoid retpoline. So, we can remove the .early_demux() handler
from inet6?_protos and need not dereference them in ip6?_rcv_finish_core().
If another proto needs .early_demux(), we can restore it at that time.
Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
include/net/protocol.h | 4 ---
include/net/tcp.h | 2 ++
include/net/udp.h | 1 +
net/ipv4/af_inet.c | 14 ++-------
net/ipv4/ip_input.c | 32 ++++++++++++++-------
net/ipv4/sysctl_net_ipv4.c | 59 ++------------------------------------
net/ipv6/ip6_input.c | 23 +++++++++------
net/ipv6/tcp_ipv6.c | 9 ++----
net/ipv6/udp.c | 9 ++----
9 files changed, 47 insertions(+), 106 deletions(-)
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 4fc75f7ae23b..312a27393e0b 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -39,8 +39,6 @@
/* This is used to register protocols. */
struct net_protocol {
- int (*early_demux)(struct sk_buff *skb);
- int (*early_demux_handler)(struct sk_buff *skb);
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb, u32 info);
unsigned int no_policy:1,
@@ -54,8 +52,6 @@ struct net_protocol {
#if IS_ENABLED(CONFIG_IPV6)
struct inet6_protocol {
- void (*early_demux)(struct sk_buff *skb);
- void (*early_demux_handler)(struct sk_buff *skb);
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 003638f73ffa..4f97c0e2d5f3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -890,6 +890,8 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb)
#endif
return 0;
}
+
+void tcp_v6_early_demux(struct sk_buff *skb);
#endif
static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
diff --git a/include/net/udp.h b/include/net/udp.h
index 18391015233e..07135de00166 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -173,6 +173,7 @@ typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup);
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
+void udp_v6_early_demux(struct sk_buff *skb);
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
{
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 06ad21597d68..1eee002b12d2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1608,12 +1608,7 @@ static const struct net_protocol igmp_protocol = {
};
#endif
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol tcp_protocol = {
- .early_demux = tcp_v4_early_demux,
- .early_demux_handler = tcp_v4_early_demux,
+static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
@@ -1621,12 +1616,7 @@ static struct net_protocol tcp_protocol = {
.icmp_strict_tag_validation = 1,
};
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol udp_protocol = {
- .early_demux = udp_v4_early_demux,
- .early_demux_handler = udp_v4_early_demux,
+static const struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 6fc45d3a1f8a..155476df8f9a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,10 +307,11 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
return true;
}
+int udp_v4_early_demux(struct sk_buff *);
+int tcp_v4_early_demux(struct sk_buff *);
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
- int (*edemux)(struct sk_buff *skb);
struct net_device *dev = skb->dev;
struct rtable *rt;
int err;
@@ -322,20 +323,29 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- if (net->ipv4.sysctl_ip_early_demux &&
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
!skb_dst(skb) &&
!skb->sk &&
!ip_is_fragment(iph)) {
- const struct net_protocol *ipprot;
- int protocol = iph->protocol;
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+ tcp_v4_early_demux(skb);
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
- err = edemux(skb);
- if (unlikely(err))
- goto drop_error;
- /* must reload iph, skb->head might have changed */
- iph = ip_hdr(skb);
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+ err = udp_v4_early_demux(skb);
+ if (unlikely(err))
+ goto drop_error;
+
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
}
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 78771272f613..df118d5b1e28 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -311,61 +311,6 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
return ret;
}
-static void proc_configure_early_demux(int enabled, int protocol)
-{
- struct net_protocol *ipprot;
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_protocol *ip6prot;
-#endif
-
- rcu_read_lock();
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot)
- ipprot->early_demux = enabled ? ipprot->early_demux_handler :
- NULL;
-
-#if IS_ENABLED(CONFIG_IPV6)
- ip6prot = rcu_dereference(inet6_protos[protocol]);
- if (ip6prot)
- ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
- NULL;
-#endif
- rcu_read_unlock();
-}
-
-static int proc_tcp_early_demux(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret = 0;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
- if (write && !ret) {
- int enabled = init_net.ipv4.sysctl_tcp_early_demux;
-
- proc_configure_early_demux(enabled, IPPROTO_TCP);
- }
-
- return ret;
-}
-
-static int proc_udp_early_demux(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret = 0;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
- if (write && !ret) {
- int enabled = init_net.ipv4.sysctl_udp_early_demux;
-
- proc_configure_early_demux(enabled, IPPROTO_UDP);
- }
-
- return ret;
-}
-
static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
int write,
void __user *buffer,
@@ -853,14 +798,14 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_udp_early_demux,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_udp_early_demux
+ .proc_handler = proc_douintvec,
},
{
.procname = "tcp_early_demux",
.data = &init_net.ipv4.sysctl_tcp_early_demux,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_tcp_early_demux
+ .proc_handler = proc_douintvec,
},
{
.procname = "ip_default_ttl",
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9ee208a348f5..62b5dac7be30 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -47,10 +47,10 @@
#include <net/inet_ecn.h>
#include <net/dst_metadata.h>
+void udp_v6_early_demux(struct sk_buff *);
+void tcp_v6_early_demux(struct sk_buff *);
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- void (*edemux)(struct sk_buff *skb);
-
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
@@ -58,13 +58,20 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
- const struct inet6_protocol *ipprot;
-
- ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
- if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
- edemux(skb);
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+ !skb_dst(skb) && !skb->sk) {
+ switch (ipv6_hdr(skb)->nexthdr) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
+ tcp_v6_early_demux(skb);
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
+ udp_v6_early_demux(skb);
+ break;
+ }
}
+
if (!skb_valid_dst(skb))
ip6_route_input(skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 711e5565e3a5..4ef55062d37c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1635,7 +1635,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
goto discard_it;
}
-static void tcp_v6_early_demux(struct sk_buff *skb)
+void tcp_v6_early_demux(struct sk_buff *skb)
{
const struct ipv6hdr *hdr;
const struct tcphdr *th;
@@ -1991,12 +1991,7 @@ struct proto tcpv6_prot = {
.diag_destroy = tcp_abort,
};
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol tcpv6_protocol = {
- .early_demux = tcp_v6_early_demux,
- .early_demux_handler = tcp_v6_early_demux,
+static const struct inet6_protocol tcpv6_protocol = {
.handler = tcp_v6_rcv,
.err_handler = tcp_v6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 685efa6a8c32..0d4f82f9ebfd 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -932,7 +932,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
return NULL;
}
-static void udp_v6_early_demux(struct sk_buff *skb)
+void udp_v6_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
const struct udphdr *uh;
@@ -1491,12 +1491,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
}
#endif
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol udpv6_protocol = {
- .early_demux = udp_v6_early_demux,
- .early_demux_handler = udp_v6_early_demux,
+static const struct inet6_protocol udpv6_protocol = {
.handler = udpv6_rcv,
.err_handler = udpv6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
--
2.30.2
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH 4.14] tcp/udp: Make early_demux back namespacified.
2022-11-03 23:50 [PATCH 4.14] tcp/udp: Make early_demux back namespacified Kuniyuki Iwashima
@ 2022-11-07 15:17 ` Greg KH
0 siblings, 0 replies; 2+ messages in thread
From: Greg KH @ 2022-11-07 15:17 UTC (permalink / raw)
To: Kuniyuki Iwashima; +Cc: stable, Jakub Kicinski
On Thu, Nov 03, 2022 at 04:50:33PM -0700, Kuniyuki Iwashima wrote:
> commit 11052589cf5c0bab3b4884d423d5f60c38fcf25d upstream
>
> Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made
> it possible to enable/disable early_demux on a per-netns basis. Then, we
> introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for
> TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for
> tcp and udp"). However, the .proc_handler() was wrong and actually
> disabled us from changing the behaviour in each netns.
>
> We can execute early_demux if net.ipv4.ip_early_demux is on and each proto
> .early_demux() handler is not NULL. When we toggle (tcp|udp)_early_demux,
> the change itself is saved in each netns variable, but the .early_demux()
> handler is a global variable, so the handler is switched based on the
> init_net's sysctl variable. Thus, netns (tcp|udp)_early_demux knobs have
> nothing to do with the logic. Whether we CAN execute proto .early_demux()
> is always decided by init_net's sysctl knob, and whether we DO it or not is
> by each netns ip_early_demux knob.
>
> This patch namespacifies (tcp|udp)_early_demux again. For now, the users
> of the .early_demux() handler are TCP and UDP only, and they are called
> directly to avoid retpoline. So, we can remove the .early_demux() handler
> from inet6?_protos and need not dereference them in ip6?_rcv_finish_core().
> If another proto needs .early_demux(), we can restore it at that time.
>
> Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp")
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
All now queued up, thanks.
greg k-h
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2022-11-07 15:17 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-11-03 23:50 [PATCH 4.14] tcp/udp: Make early_demux back namespacified Kuniyuki Iwashima
2022-11-07 15:17 ` Greg KH
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).