* [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp
@ 2017-03-22 3:20 Subash Abhinov Kasiviswanathan
2017-03-22 21:19 ` David Miller
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Subash Abhinov Kasiviswanathan @ 2017-03-22 3:20 UTC (permalink / raw)
To: netdev, eric.dumazet
Cc: Subash Abhinov Kasiviswanathan, Stephen Hemminger, Tom Herbert
Certain systems process a significant unconnected UDP workload.
It would be preferable to disable UDP early demux for those systems
and enable it for TCP only.
By disabling UDP demux, we see these slight gains on an ARM64 system:
782 -> 788Mbps unconnected single stream UDPv4
633 -> 654Mbps unconnected UDPv4 different sources
The performance impact can change based on CPU architecture and cache
sizes. There will not be much difference seen if the entire UDP hash table
is in cache.
Both sysctls are enabled by default to preserve existing behavior.
v1->v2: Change function pointer instead of adding conditional as
suggested by Stephen.
v2->v3: Read once in callers to avoid issues due to compiler
optimizations. Also update commit message with the tests.
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Tom Herbert <tom@herbertland.com>
---
Documentation/networking/ip-sysctl.txt | 11 +++++++-
include/net/netns/ipv4.h | 2 ++
include/net/tcp.h | 2 ++
include/net/udp.h | 3 +++
net/ipv4/af_inet.c | 22 ++++++++++++++--
net/ipv4/ip_input.c | 2 +-
net/ipv4/sysctl_net_ipv4.c | 48 ++++++++++++++++++++++++++++++++++
net/ipv6/ip6_input.c | 2 +-
net/ipv6/tcp_ipv6.c | 10 ++++++-
net/ipv6/udp.c | 10 ++++++-
10 files changed, 105 insertions(+), 7 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ed3d079..6b921a1 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -848,12 +848,21 @@ ip_dynaddr - BOOLEAN
ip_early_demux - BOOLEAN
Optimize input packet processing down to one demux for
certain kinds of local sockets. Currently we only do this
- for established TCP sockets.
+ for established TCP and connected UDP sockets.
It may add an additional cost for pure routing workloads that
reduces overall throughput, in such case you should disable it.
Default: 1
+tcp_early_demux - BOOLEAN
+ Enable early demux for established TCP sockets.
+ Default: 1
+
+udp_early_demux - BOOLEAN
+ Enable early demux for connected UDP sockets. Disable this if
+ your system could experience more unconnected load.
+ Default: 1
+
icmp_echo_ignore_all - BOOLEAN
If set non-zero, then the kernel will ignore all ICMP ECHO
requests sent to it.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2e9d649..a489b76 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -95,6 +95,8 @@ struct netns_ipv4 {
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;
int sysctl_ip_early_demux;
+ int sysctl_tcp_early_demux;
+ int sysctl_udp_early_demux;
int sysctl_fwmark_reflect;
int sysctl_tcp_fwmark_accept;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e614ad4..edc1df4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1932,4 +1932,6 @@ static inline void tcp_listendrop(const struct sock *sk)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
}
+void tcp_v4_early_demux_configure(int enable);
+void tcp_v6_early_demux_configure(int enable);
#endif /* _TCP_H */
diff --git a/include/net/udp.h b/include/net/udp.h
index c9d8b8e..33198fa 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -372,4 +372,7 @@ struct udp_iter_state {
#if IS_ENABLED(CONFIG_IPV6)
void udpv6_encap_enable(void);
#endif
+
+void udp_v4_early_demux_configure(int enable);
+void udp_v6_early_demux_configure(int enable);
#endif /* _UDP_H */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6b1fc6e..d286750 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1599,7 +1599,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
};
#endif
-static const struct net_protocol tcp_protocol = {
+static struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
@@ -1608,7 +1608,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
.icmp_strict_tag_validation = 1,
};
-static const struct net_protocol udp_protocol = {
+static struct net_protocol udp_protocol = {
.early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
@@ -1616,6 +1616,22 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
.netns_ok = 1,
};
+void tcp_v4_early_demux_configure(int enable)
+{
+ if (enable)
+ tcp_protocol.early_demux = tcp_v4_early_demux;
+ else
+ tcp_protocol.early_demux = NULL;
+}
+
+void udp_v4_early_demux_configure(int enable)
+{
+ if (enable)
+ udp_protocol.early_demux = udp_v4_early_demux;
+ else
+ udp_protocol.early_demux = NULL;
+}
+
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
@@ -1720,6 +1736,8 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
+ net->ipv4.sysctl_udp_early_demux = 1;
+ net->ipv4.sysctl_tcp_early_demux = 1;
#ifdef CONFIG_SYSCTL
net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
#endif
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d6feabb..48d0fc8 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -329,7 +329,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
int protocol = iph->protocol;
ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot && ipprot->early_demux) {
+ if (ipprot && READ_ONCE(ipprot->early_demux)) {
ipprot->early_demux(skb);
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 11aaef0..86ad484 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -294,6 +294,40 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
return ret;
}
+static int proc_tcp_early_demux(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = 0;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (write && !ret) {
+ int enabled = init_net.ipv4.sysctl_tcp_early_demux;
+
+ tcp_v4_early_demux_configure(enabled);
+ tcp_v6_early_demux_configure(enabled);
+ }
+
+ return ret;
+}
+
+static int proc_udp_early_demux(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = 0;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (write && !ret) {
+ int enabled = init_net.ipv4.sysctl_udp_early_demux;
+
+ udp_v4_early_demux_configure(enabled);
+ udp_v6_early_demux_configure(enabled);
+ }
+
+ return ret;
+}
+
static struct ctl_table ipv4_table[] = {
{
.procname = "tcp_timestamps",
@@ -750,6 +784,20 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
.proc_handler = proc_dointvec
},
{
+ .procname = "udp_early_demux",
+ .data = &init_net.ipv4.sysctl_udp_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_udp_early_demux
+ },
+ {
+ .procname = "tcp_early_demux",
+ .data = &init_net.ipv4.sysctl_tcp_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tcp_early_demux
+ },
+ {
.procname = "ip_default_ttl",
.data = &init_net.ipv4.sysctl_ip_default_ttl,
.maxlen = sizeof(int),
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index aacfb4b..30d18cb 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -60,7 +60,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
const struct inet6_protocol *ipprot;
ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
- if (ipprot && ipprot->early_demux)
+ if (ipprot && READ_ONCE(ipprot->early_demux))
ipprot->early_demux(skb);
}
if (!skb_valid_dst(skb))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0f08d71..e26622f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1925,13 +1925,21 @@ struct proto tcpv6_prot = {
.diag_destroy = tcp_abort,
};
-static const struct inet6_protocol tcpv6_protocol = {
+static struct inet6_protocol tcpv6_protocol = {
.early_demux = tcp_v6_early_demux,
.handler = tcp_v6_rcv,
.err_handler = tcp_v6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
+void tcp_v6_early_demux_configure(int enable)
+{
+ if (enable)
+ tcpv6_protocol.early_demux = tcp_v6_early_demux;
+ else
+ tcpv6_protocol.early_demux = NULL;
+}
+
static struct inet_protosw tcpv6_protosw = {
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 08a188f..7178a18 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1436,13 +1436,21 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
}
#endif
-static const struct inet6_protocol udpv6_protocol = {
+static struct inet6_protocol udpv6_protocol = {
.early_demux = udp_v6_early_demux,
.handler = udpv6_rcv,
.err_handler = udpv6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
+void udp_v6_early_demux_configure(int enable)
+{
+ if (enable)
+ udpv6_protocol.early_demux = udp_v6_early_demux;
+ else
+ udpv6_protocol.early_demux = NULL;
+}
+
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
int udp6_seq_show(struct seq_file *seq, void *v)
--
1.9.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp
2017-03-22 3:20 [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp Subash Abhinov Kasiviswanathan
@ 2017-03-22 21:19 ` David Miller
2017-03-23 11:49 ` kbuild test robot
2017-03-23 12:52 ` kbuild test robot
2 siblings, 0 replies; 4+ messages in thread
From: David Miller @ 2017-03-22 21:19 UTC (permalink / raw)
To: subashab; +Cc: netdev, eric.dumazet, stephen, tom
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Tue, 21 Mar 2017 21:20:10 -0600
> @@ -329,7 +329,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
> int protocol = iph->protocol;
>
> ipprot = rcu_dereference(inet_protos[protocol]);
> - if (ipprot && ipprot->early_demux) {
> + if (ipprot && READ_ONCE(ipprot->early_demux)) {
> ipprot->early_demux(skb);
I think you need to use a local variable for the function pointer in conjunction
with READ_ONCE() for this to work properly:
if (ipprot && (func = READ_ONCE(ipprot->early_demux))) {
func(skb);
...
> diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
> index aacfb4b..30d18cb 100644
> --- a/net/ipv6/ip6_input.c
> +++ b/net/ipv6/ip6_input.c
> @@ -60,7 +60,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
> const struct inet6_protocol *ipprot;
>
> ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
> - if (ipprot && ipprot->early_demux)
> + if (ipprot && READ_ONCE(ipprot->early_demux))
> ipprot->early_demux(skb);
> }
> if (!skb_valid_dst(skb))
Likewise.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp
2017-03-22 3:20 [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp Subash Abhinov Kasiviswanathan
2017-03-22 21:19 ` David Miller
@ 2017-03-23 11:49 ` kbuild test robot
2017-03-23 12:52 ` kbuild test robot
2 siblings, 0 replies; 4+ messages in thread
From: kbuild test robot @ 2017-03-23 11:49 UTC (permalink / raw)
To: Subash Abhinov Kasiviswanathan
Cc: kbuild-all, netdev, eric.dumazet, Subash Abhinov Kasiviswanathan,
Stephen Hemminger, Tom Herbert
[-- Attachment #1: Type: text/plain, Size: 884 bytes --]
Hi Subash,
[auto build test ERROR on net-next/master]
url: https://github.com/0day-ci/linux/commits/Subash-Abhinov-Kasiviswanathan/net-Add-sysctl-to-toggle-early-demux-for-tcp-and-udp/20170323-182822
config: x86_64-kexec (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All errors (new ones prefixed by >>):
net/built-in.o: In function `proc_tcp_early_demux':
sysctl_net_ipv4.c:(.text+0x7fe04): undefined reference to `tcp_v6_early_demux_configure'
net/built-in.o: In function `proc_udp_early_demux':
>> sysctl_net_ipv4.c:(.text+0x7fe3d): undefined reference to `udp_v6_early_demux_configure'
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 25154 bytes --]
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp
2017-03-22 3:20 [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp Subash Abhinov Kasiviswanathan
2017-03-22 21:19 ` David Miller
2017-03-23 11:49 ` kbuild test robot
@ 2017-03-23 12:52 ` kbuild test robot
2 siblings, 0 replies; 4+ messages in thread
From: kbuild test robot @ 2017-03-23 12:52 UTC (permalink / raw)
To: Subash Abhinov Kasiviswanathan
Cc: kbuild-all, netdev, eric.dumazet, Subash Abhinov Kasiviswanathan,
Stephen Hemminger, Tom Herbert
[-- Attachment #1: Type: text/plain, Size: 2052 bytes --]
Hi Subash,
[auto build test ERROR on net-next/master]
url: https://github.com/0day-ci/linux/commits/Subash-Abhinov-Kasiviswanathan/net-Add-sysctl-to-toggle-early-demux-for-tcp-and-udp/20170323-182822
config: alpha-defconfig (attached as .config)
compiler: alpha-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=alpha
All errors (new ones prefixed by >>):
net/built-in.o: In function `proc_tcp_early_demux':
net/ipv4/sysctl_net_ipv4.c:308: undefined reference to `tcp_v6_early_demux_configure'
net/ipv4/sysctl_net_ipv4.c:308: undefined reference to `tcp_v6_early_demux_configure'
net/built-in.o: In function `proc_udp_early_demux':
>> net/ipv4/sysctl_net_ipv4.c:325: undefined reference to `udp_v6_early_demux_configure'
>> net/ipv4/sysctl_net_ipv4.c:325: undefined reference to `udp_v6_early_demux_configure'
vim +325 net/ipv4/sysctl_net_ipv4.c
302 ret = proc_dointvec(table, write, buffer, lenp, ppos);
303
304 if (write && !ret) {
305 int enabled = init_net.ipv4.sysctl_tcp_early_demux;
306
307 tcp_v4_early_demux_configure(enabled);
> 308 tcp_v6_early_demux_configure(enabled);
309 }
310
311 return ret;
312 }
313
314 static int proc_udp_early_demux(struct ctl_table *table, int write,
315 void __user *buffer, size_t *lenp, loff_t *ppos)
316 {
317 int ret = 0;
318
319 ret = proc_dointvec(table, write, buffer, lenp, ppos);
320
321 if (write && !ret) {
322 int enabled = init_net.ipv4.sysctl_udp_early_demux;
323
324 udp_v4_early_demux_configure(enabled);
> 325 udp_v6_early_demux_configure(enabled);
326 }
327
328 return ret;
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 12361 bytes --]
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2017-03-23 12:52 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2017-03-22 3:20 [PATCH net-next v3] net: Add sysctl to toggle early demux for tcp and udp Subash Abhinov Kasiviswanathan
2017-03-22 21:19 ` David Miller
2017-03-23 11:49 ` kbuild test robot
2017-03-23 12:52 ` kbuild test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).