* [RFC PATCH] reuseport: compute the ehash only if needed
@ 2017-12-12 13:09 Paolo Abeni
2017-12-13 20:08 ` David Miller
0 siblings, 1 reply; 6+ messages in thread
From: Paolo Abeni @ 2017-12-12 13:09 UTC (permalink / raw)
To: netdev; +Cc: David S. Miller, Craig Gallek, Eric Dumazet
When a reuseport socket group is using a BPF filter to distribute
the packets among the sockets, we don't need to compute any hash
value, but the current reuseport_select_sock() requires the
caller to compute such hash in advance.
This patch reworks reuseport_select_sock() to compute the hash value
only if needed - missing or failing BPF filter. Since different
hash functions have different argument types - ipv4 addresses vs ipv6
ones - to avoid over-complicate the interface, reuseport_select_sock()
is now a macro.
Additionally, the sk_reuseport test is move inside reuseport_select_sock,
to avoid some code duplication.
Overall this gives small but measurable performance improvement
under UDP flood while using SO_REUSEPORT + BPF.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/net/sock_reuseport.h | 32 ++++++++++++++++++++++++++++----
net/core/sock_reuseport.c | 34 +++++++++++++++-------------------
net/ipv4/inet_hashtables.c | 28 ++++++++++------------------
net/ipv4/udp.c | 30 ++++++++++++------------------
net/ipv6/inet6_hashtables.c | 28 ++++++++++------------------
net/ipv6/udp.c | 31 ++++++++++++-------------------
6 files changed, 87 insertions(+), 96 deletions(-)
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0054b3a9b923..e7d71d22dca7 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -16,13 +16,37 @@ struct sock_reuseport {
struct sock *socks[0]; /* array of sock pointers */
};
+struct reuseport_info {
+ struct sock_reuseport *reuse;
+ struct sock *sk;
+ u16 socks;
+};
+
extern int reuseport_alloc(struct sock *sk);
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2);
extern void reuseport_detach_sock(struct sock *sk);
-extern struct sock *reuseport_select_sock(struct sock *sk,
- u32 hash,
- struct sk_buff *skb,
- int hdr_len);
+bool __reuseport_get_info(struct sock *sk, struct sk_buff *skb, int hdr_len,
+ struct reuseport_info *info);
+static inline struct sock *__reuseport_select_sock(struct reuseport_info *info,
+ u32 hash)
+{
+ return info->reuse->socks[reciprocal_scale(hash, info->socks)];
+}
+
+#define reuseport_select_sock(sk, skb, net, hlen, fn, saddr, sport, daddr, dport) \
+({ \
+ struct reuseport_info info; \
+ info.sk = NULL; \
+ if (sk->sk_reuseport) { \
+ rcu_read_lock(); \
+ if (__reuseport_get_info(sk, skb, hlen, &info) && !info.sk) \
+ info.sk = __reuseport_select_sock(&info, \
+ fn(net, daddr, hnum, saddr, sport)); \
+ rcu_read_unlock(); \
+ } \
+ info.sk; \
+})
+
extern struct bpf_prog *reuseport_attach_prog(struct sock *sk,
struct bpf_prog *prog);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index c5bb52bc73a1..8d66e66239a2 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -201,31 +201,30 @@ static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
}
/**
- * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ * __reuseport_get_info - Retrieve information for reuseport socket selection
* @sk: First socket in the group.
- * @hash: When no BPF filter is available, use this hash to select.
* @skb: skb to run through BPF filter.
* @hdr_len: BPF filter expects skb data pointer at payload data. If
* the skb does not yet point at the payload, this parameter represents
* how far the pointer needs to advance to reach the payload.
- * Returns a socket that should receive the packet (or NULL on error).
+ * @info: reuseport information, filled only if return value is true
+ * Returns true if @sk is a reuseport socket, and fill @info accordingly.
+ * if @info.sk is NULL, the caller must retrieve the selected reuseport socket
+ * calling __reuseport_select_sock(). The caller must hold the RCU lock.
*/
-struct sock *reuseport_select_sock(struct sock *sk,
- u32 hash,
- struct sk_buff *skb,
- int hdr_len)
+bool __reuseport_get_info(struct sock *sk, struct sk_buff *skb, int hdr_len,
+ struct reuseport_info *info)
{
struct sock_reuseport *reuse;
struct bpf_prog *prog;
- struct sock *sk2 = NULL;
u16 socks;
- rcu_read_lock();
+ info->sk = NULL;
reuse = rcu_dereference(sk->sk_reuseport_cb);
/* if memory allocation failed or add call is not yet complete */
if (!reuse)
- goto out;
+ return false;
prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
@@ -234,18 +233,15 @@ struct sock *reuseport_select_sock(struct sock *sk,
smp_rmb();
if (prog && skb)
- sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ info->sk = run_bpf(reuse, socks, prog, skb, hdr_len);
- /* no bpf or invalid bpf result: fall back to hash usage */
- if (!sk2)
- sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+ info->reuse = reuse;
+ info->socks = socks;
+ return true;
}
-
-out:
- rcu_read_unlock();
- return sk2;
+ return false;
}
-EXPORT_SYMBOL(reuseport_select_sock);
+EXPORT_SYMBOL(__reuseport_get_info);
struct bpf_prog *
reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index f6f58108b4c5..eed48aab05f5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -273,21 +273,17 @@ static struct sock *inet_lhash2_lookup(struct net *net,
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
- u32 phash = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
score = compute_score(sk, net, hnum, daddr,
dif, sdif, exact_dif);
if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net, doff,
+ inet_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
result = sk;
hiscore = score;
}
@@ -310,7 +306,6 @@ struct sock *__inet_lookup_listener(struct net *net,
struct sock *sk, *result = NULL;
int score, hiscore = 0;
unsigned int hash2;
- u32 phash = 0;
if (ilb->count <= 10 || !hashinfo->lhash2)
goto port_lookup;
@@ -346,14 +341,11 @@ struct sock *__inet_lookup_listener(struct net *net,
score = compute_score(sk, net, hnum, daddr,
dif, sdif, exact_dif);
if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net, doff,
+ inet_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
result = sk;
hiscore = score;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e9c0d1e1772e..8072755bb5fc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -440,7 +440,6 @@ static struct sock *udp4_lib_lookup2(struct net *net,
{
struct sock *sk, *result;
int score, badness;
- u32 hash = 0;
result = NULL;
badness = 0;
@@ -448,14 +447,12 @@ static struct sock *udp4_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net,
+ sizeof(struct udphdr),
+ udp_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
badness = score;
result = sk;
}
@@ -476,7 +473,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
bool exact_dif = udp_lib_exact_dif_match(net, skb);
int score, badness;
- u32 hash = 0;
if (hslot->count > 10) {
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
@@ -513,14 +509,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net,
+ sizeof(struct udphdr),
+ udp_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
result = sk;
badness = score;
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 2febe26de6a1..f6167e647672 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -136,21 +136,17 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
- u32 phash = 0;
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
sk = (struct sock *)icsk;
score = compute_score(sk, net, hnum, daddr, dif, sdif,
exact_dif);
if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net, doff,
+ inet6_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
result = sk;
hiscore = score;
}
@@ -173,7 +169,6 @@ struct sock *inet6_lookup_listener(struct net *net,
struct sock *sk, *result = NULL;
int score, hiscore = 0;
unsigned int hash2;
- u32 phash = 0;
if (ilb->count <= 10 || !hashinfo->lhash2)
goto port_lookup;
@@ -208,14 +203,11 @@ struct sock *inet6_lookup_listener(struct net *net,
sk_for_each(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net, doff,
+ inet6_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
result = sk;
hiscore = score;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eecf9f0faf29..936c2a5c7147 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -169,7 +169,6 @@ static struct sock *udp6_lib_lookup2(struct net *net,
{
struct sock *sk, *result;
int score, badness;
- u32 hash = 0;
result = NULL;
badness = -1;
@@ -177,15 +176,12 @@ static struct sock *udp6_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp6_ehashfn(net, daddr, hnum,
- saddr, sport);
-
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net,
+ sizeof(struct udphdr),
+ udp6_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
result = sk;
badness = score;
}
@@ -206,7 +202,6 @@ struct sock *__udp6_lib_lookup(struct net *net,
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
bool exact_dif = udp6_lib_exact_dif_match(net, skb);
int score, badness;
- u32 hash = 0;
if (hslot->count > 10) {
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
@@ -244,14 +239,12 @@ struct sock *__udp6_lib_lookup(struct net *net,
score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
sdif, exact_dif);
if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (result)
- return result;
- }
+ result = reuseport_select_sock(sk, skb, net,
+ sizeof(struct udphdr),
+ udp6_ehashfn, saddr,
+ sport, daddr, hnum);
+ if (result)
+ return result;
result = sk;
badness = score;
}
--
2.14.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] reuseport: compute the ehash only if needed
@ 2017-12-12 17:44 Craig Gallek
2017-12-12 18:25 ` Paolo Abeni
0 siblings, 1 reply; 6+ messages in thread
From: Craig Gallek @ 2017-12-12 17:44 UTC (permalink / raw)
To: Paolo Abeni; +Cc: netdev, David S. Miller, Eric Dumazet
On Tue, Dec 12, 2017 at 8:09 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> When a reuseport socket group is using a BPF filter to distribute
> the packets among the sockets, we don't need to compute any hash
> value, but the current reuseport_select_sock() requires the
> caller to compute such hash in advance.
>
> This patch reworks reuseport_select_sock() to compute the hash value
> only if needed - missing or failing BPF filter. Since different
> hash functions have different argument types - ipv4 addresses vs ipv6
> ones - to avoid over-complicate the interface, reuseport_select_sock()
> is now a macro.
Purely subjective, but I think a slightly more complicated function
signature for reuseport_select_sock (and reuseport_select_sock6?)
would look a little better than this macro. It would avoid needing to
expose the reuseport_info struct and would keep the rcu semantics
entirely within the function call (the fast-path memory access
semantics here are already non-trivial...)
> Additionally, the sk_reuseport test is move inside reuseport_select_sock,
> to avoid some code duplication.
>
> Overall this gives small but measurable performance improvement
> under UDP flood while using SO_REUSEPORT + BPF.
Exciting, do you have some specific numbers here? I'd be interested
in knowing what kinds of loads you end up seeing improvements for.
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] reuseport: compute the ehash only if needed
2017-12-12 17:44 [RFC PATCH] reuseport: compute the ehash only if needed Craig Gallek
@ 2017-12-12 18:25 ` Paolo Abeni
0 siblings, 0 replies; 6+ messages in thread
From: Paolo Abeni @ 2017-12-12 18:25 UTC (permalink / raw)
To: Craig Gallek; +Cc: netdev, David S. Miller, Eric Dumazet
Hi,
On Tue, 2017-12-12 at 12:44 -0500, Craig Gallek wrote:
> On Tue, Dec 12, 2017 at 8:09 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> > When a reuseport socket group is using a BPF filter to distribute
> > the packets among the sockets, we don't need to compute any hash
> > value, but the current reuseport_select_sock() requires the
> > caller to compute such hash in advance.
> >
> > This patch reworks reuseport_select_sock() to compute the hash value
> > only if needed - missing or failing BPF filter. Since different
> > hash functions have different argument types - ipv4 addresses vs ipv6
> > ones - to avoid over-complicate the interface, reuseport_select_sock()
> > is now a macro.
>
> Purely subjective, but I think a slightly more complicated function
> signature for reuseport_select_sock (and reuseport_select_sock6?)
> would look a little better than this macro. It would avoid needing to
> expose the reuseport_info struct and would keep the rcu semantics
> entirely within the function call (the fast-path memory access
> semantics here are already non-trivial...)
Thanks for the feedback.
I was in doubt about the macro, too. The downside of using explicit
functions is the very long argument list and the need of 2 separate
functions for ipv4 and ipv6.
> > Additionally, the sk_reuseport test is move inside reuseport_select_sock,
> > to avoid some code duplication.
> >
> > Overall this gives small but measurable performance improvement
> > under UDP flood while using SO_REUSEPORT + BPF.
>
> Exciting, do you have some specific numbers here? I'd be interested
> in knowing what kinds of loads you end up seeing improvements for.
this are the numbers I collected so far:
(ipv4)
socks nr vanilla(kpps) patched(kpps)
1 1747 1843
2 3109 3140
3 4480 4534
4 5796 5864
5 7063 7139
6 8168 8235
(ipv6)
socks nr vanilla(kpps) patched(kpps)
1 1433 1544
2 2537 2731
3 3622 3794
4 4689 4979
5 5738 6011
6 6671 6920
Cheers,
Paolo
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] reuseport: compute the ehash only if needed
2017-12-12 13:09 Paolo Abeni
@ 2017-12-13 20:08 ` David Miller
2017-12-14 8:29 ` Paolo Abeni
0 siblings, 1 reply; 6+ messages in thread
From: David Miller @ 2017-12-13 20:08 UTC (permalink / raw)
To: pabeni; +Cc: netdev, kraig, edumazet
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 12 Dec 2017 14:09:28 +0100
> When a reuseport socket group is using a BPF filter to distribute
> the packets among the sockets, we don't need to compute any hash
> value, but the current reuseport_select_sock() requires the
> caller to compute such hash in advance.
>
> This patch reworks reuseport_select_sock() to compute the hash value
> only if needed - missing or failing BPF filter. Since different
> hash functions have different argument types - ipv4 addresses vs ipv6
> ones - to avoid over-complicate the interface, reuseport_select_sock()
> is now a macro.
>
> Additionally, the sk_reuseport test is move inside reuseport_select_sock,
> to avoid some code duplication.
>
> Overall this gives small but measurable performance improvement
> under UDP flood while using SO_REUSEPORT + BPF.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
I don't doubt that this improves the case where the hash is elided, but
I suspect it makes things slower othewise.
You're doing two function calls for an operation that used to require
just one in the bottom of the call chain.
You're also putting something onto the stack that the compiler can't
possibly optimize into purely using cpu registers to hold.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] reuseport: compute the ehash only if needed
2017-12-13 20:08 ` David Miller
@ 2017-12-14 8:29 ` Paolo Abeni
2017-12-14 13:41 ` David Miller
0 siblings, 1 reply; 6+ messages in thread
From: Paolo Abeni @ 2017-12-14 8:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev, kraig, edumazet
Hi,
On Wed, 2017-12-13 at 15:08 -0500, David Miller wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> Date: Tue, 12 Dec 2017 14:09:28 +0100
>
> > When a reuseport socket group is using a BPF filter to distribute
> > the packets among the sockets, we don't need to compute any hash
> > value, but the current reuseport_select_sock() requires the
> > caller to compute such hash in advance.
> >
> > This patch reworks reuseport_select_sock() to compute the hash value
> > only if needed - missing or failing BPF filter. Since different
> > hash functions have different argument types - ipv4 addresses vs ipv6
> > ones - to avoid over-complicate the interface, reuseport_select_sock()
> > is now a macro.
> >
> > Additionally, the sk_reuseport test is move inside reuseport_select_sock,
> > to avoid some code duplication.
> >
> > Overall this gives small but measurable performance improvement
> > under UDP flood while using SO_REUSEPORT + BPF.
> >
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>
> I don't doubt that this improves the case where the hash is elided, but
> I suspect it makes things slower othewise.
>
> You're doing two function calls for an operation that used to require
> just one in the bottom of the call chain.
>
> You're also putting something onto the stack that the compiler can't
> possibly optimize into purely using cpu registers to hold.
Thank you for the feedback.
I was unable to measure any performance regression for the hash based
demultiplexing, and I think that the number of function calls is
unchanged in such scenario (with vanilla kernel we have ehash() and
reuseport_select_sock(), with the patched one __reuseport_get_info()
and ehash()).
I agree you are right about the additional stack usage introduced by
this patch.
Overall I see we need something better than this.
Thanks,
Paolo
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC PATCH] reuseport: compute the ehash only if needed
2017-12-14 8:29 ` Paolo Abeni
@ 2017-12-14 13:41 ` David Miller
0 siblings, 0 replies; 6+ messages in thread
From: David Miller @ 2017-12-14 13:41 UTC (permalink / raw)
To: pabeni; +Cc: netdev, kraig, edumazet
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 14 Dec 2017 09:29:46 +0100
> I was unable to measure any performance regression for the hash based
> demultiplexing, and I think that the number of function calls is
> unchanged in such scenario (with vanilla kernel we have ehash() and
> reuseport_select_sock(), with the patched one __reuseport_get_info()
> and ehash()).
>
> I agree you are right about the additional stack usage introduced by
> this patch.
>
> Overall I see we need something better than this.
Thanks for checking whether it's slower or not. I wonder if x86-64
is putting the argument parts on the stack anyways...
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2017-12-14 13:42 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-12-12 17:44 [RFC PATCH] reuseport: compute the ehash only if needed Craig Gallek
2017-12-12 18:25 ` Paolo Abeni
-- strict thread matches above, loose matches on Subject: below --
2017-12-12 13:09 Paolo Abeni
2017-12-13 20:08 ` David Miller
2017-12-14 8:29 ` Paolo Abeni
2017-12-14 13:41 ` David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).