* [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
@ 2024-06-20 22:19 ` Yan Zhai
2024-06-21 9:11 ` Alexander Lobakin
` (3 more replies)
2024-06-20 22:19 ` [RFC net-next 2/9] xdp: add XDP_FLAGS_GRO_DISABLED flag Yan Zhai
` (7 subsequent siblings)
8 siblings, 4 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Willem de Bruijn, Simon Horman, Florian Westphal,
Mina Almasry, Abhishek Chauhan, David Howells, Alexander Lobakin,
David Ahern, Richard Gobert, Antoine Tenart, Yan Zhai,
Felix Fietkau, Soheil Hassas Yeganeh, Pavel Begunkov,
Lorenzo Bianconi, Thomas Weißschuh, netdev, linux-kernel,
bpf
Software GRO is currently controlled by a single switch, i.e.
ethtool -K dev gro on|off
However, this is not always desired. When GRO is enabled, even traffic
that the kernel cannot actually aggregate still has to run through the
GRO receive handlers, with no benefit.
There are also scenarios where turning off GRO is a requirement. For
example, in our production environment a TC egress hook may add multiple
encapsulation headers to forwarded skbs for load balancing and isolation
purposes. The encapsulation is implemented via BPF. A problem then
arises: there is no way to properly offload a double-encapsulated
packet, since the skb only has network_header and inner_network_header
to track one layer of encapsulation, not two.
On the other hand, not all traffic through this device needs double
encapsulation, yet we have to turn off GRO completely for any ingress
device as a result.
Introduce a bit on the skb so that the GRO engine can be told to skip
GRO for this skb, rather than GRO being all-or-nothing for all traffic.
Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
include/linux/netdevice.h | 9 +++++++--
include/linux/skbuff.h | 10 ++++++++++
net/Kconfig | 10 ++++++++++
net/core/gro.c | 2 +-
net/core/gro_cells.c | 2 +-
net/core/skbuff.c | 4 ++++
6 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c83b390191d4..2ca0870b1221 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2415,11 +2415,16 @@ struct net_device {
((dev)->devlink_port = (port)); \
})
-static inline bool netif_elide_gro(const struct net_device *dev)
+static inline bool netif_elide_gro(const struct sk_buff *skb)
{
- if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
+ if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
return true;
+
+#ifdef CONFIG_SKB_GRO_CONTROL
+ return skb->gro_disabled;
+#else
return false;
+#endif
}
#define NETDEV_ALIGN 32
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f4cda3fbdb75..48b10ece95b5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1008,6 +1008,9 @@ struct sk_buff {
#if IS_ENABLED(CONFIG_IP_SCTP)
__u8 csum_not_inet:1;
#endif
+#ifdef CONFIG_SKB_GRO_CONTROL
+ __u8 gro_disabled:1;
+#endif
#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
__u16 tc_index; /* traffic control index */
@@ -1215,6 +1218,13 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
#endif
}
+static inline void skb_disable_gro(struct sk_buff *skb)
+{
+#ifdef CONFIG_SKB_GRO_CONTROL
+ skb->gro_disabled = 1;
+#endif
+}
+
/**
* skb_unref - decrement the skb's reference count
* @skb: buffer
diff --git a/net/Kconfig b/net/Kconfig
index 9fe65fa26e48..47d1ee92df15 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -289,6 +289,16 @@ config MAX_SKB_FRAGS
and in drivers using build_skb().
If unsure, say 17.
+config SKB_GRO_CONTROL
+ bool "allow disable GRO on per-packet basis"
+ default y
+ help
+ By default GRO can only be enabled or disabled per network device.
+ This can be cumbersome for certain scenarios.
+ Toggling this option will allow disabling GRO for selected packets,
+ e.g. by XDP programs, so that it is more flexibile.
+ Extra overhead should be minimal.
+
config RPS
bool "Receive packet steering"
depends on SMP && SYSFS
diff --git a/net/core/gro.c b/net/core/gro.c
index b3b43de1a650..46232a0d1983 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -476,7 +476,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
enum gro_result ret;
int same_flow;
- if (netif_elide_gro(skb->dev))
+ if (netif_elide_gro(skb))
goto normal;
gro_list_prepare(&gro_list->list, skb);
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index ff8e5b64bf6b..1bf15783300f 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -20,7 +20,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
if (unlikely(!(dev->flags & IFF_UP)))
goto drop;
- if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
+ if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(skb)) {
res = netif_rx(skb);
goto unlock;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2315c088e91d..82bd297921c1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6030,6 +6030,10 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
ipvs_reset(skb);
skb->mark = 0;
skb_clear_tstamp(skb);
+#ifdef CONFIG_SKB_GRO_CONTROL
+ /* hand back GRO control to next netns */
+ skb->gro_disabled = 0;
+#endif
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
--
2.30.2
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-20 22:19 ` [RFC net-next 1/9] skb: introduce gro_disabled bit Yan Zhai
@ 2024-06-21 9:11 ` Alexander Lobakin
2024-06-21 15:40 ` Yan Zhai
2024-06-21 9:49 ` Paolo Abeni
` (2 subsequent siblings)
3 siblings, 1 reply; 33+ messages in thread
From: Alexander Lobakin @ 2024-06-21 9:11 UTC (permalink / raw)
To: Yan Zhai
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, David Ahern, Richard Gobert, Antoine Tenart,
Felix Fietkau, Soheil Hassas Yeganeh, Pavel Begunkov,
Lorenzo Bianconi, Thomas Weißschuh, linux-kernel, bpf
From: Yan Zhai <yan@cloudflare.com>
Date: Thu, 20 Jun 2024 15:19:10 -0700
> Software GRO is currently controlled by a single switch, i.e.
>
> ethtool -K dev gro on|off
>
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
>
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.
Implement it in the kernel then? :D
> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.
>
> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> this skb, rather than having to be 0-or-1 for all traffic.
>
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
> include/linux/netdevice.h | 9 +++++++--
> include/linux/skbuff.h | 10 ++++++++++
> net/Kconfig | 10 ++++++++++
> net/core/gro.c | 2 +-
> net/core/gro_cells.c | 2 +-
> net/core/skbuff.c | 4 ++++
> 6 files changed, 33 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c83b390191d4..2ca0870b1221 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2415,11 +2415,16 @@ struct net_device {
> ((dev)->devlink_port = (port)); \
> })
>
> -static inline bool netif_elide_gro(const struct net_device *dev)
> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> {
> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> return true;
> +
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + return skb->gro_disabled;
> +#else
> return false;
> +#endif
> }
>
> #define NETDEV_ALIGN 32
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index f4cda3fbdb75..48b10ece95b5 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1008,6 +1008,9 @@ struct sk_buff {
> #if IS_ENABLED(CONFIG_IP_SCTP)
> __u8 csum_not_inet:1;
> #endif
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + __u8 gro_disabled:1;
> +#endif
>
> #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
> __u16 tc_index; /* traffic control index */
> @@ -1215,6 +1218,13 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
> #endif
> }
>
> +static inline void skb_disable_gro(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + skb->gro_disabled = 1;
> +#endif
> +}
> +
> /**
> * skb_unref - decrement the skb's reference count
> * @skb: buffer
> diff --git a/net/Kconfig b/net/Kconfig
> index 9fe65fa26e48..47d1ee92df15 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -289,6 +289,16 @@ config MAX_SKB_FRAGS
> and in drivers using build_skb().
> If unsure, say 17.
>
> +config SKB_GRO_CONTROL
> + bool "allow disable GRO on per-packet basis"
> + default y
> + help
> + By default GRO can only be enabled or disabled per network device.
> + This can be cumbersome for certain scenarios.
> + Toggling this option will allow disabling GRO for selected packets,
> + e.g. by XDP programs, so that it is more flexibile.
> + Extra overhead should be minimal.
I don't think we need a Kconfig option for that. Can't it be
unconditional? Is there any real eye-visible overhead?
> +
> config RPS
> bool "Receive packet steering"
> depends on SMP && SYSFS
> diff --git a/net/core/gro.c b/net/core/gro.c
> index b3b43de1a650..46232a0d1983 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -476,7 +476,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> enum gro_result ret;
> int same_flow;
>
> - if (netif_elide_gro(skb->dev))
> + if (netif_elide_gro(skb))
> goto normal;
>
> gro_list_prepare(&gro_list->list, skb);
> diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
> index ff8e5b64bf6b..1bf15783300f 100644
> --- a/net/core/gro_cells.c
> +++ b/net/core/gro_cells.c
> @@ -20,7 +20,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
> if (unlikely(!(dev->flags & IFF_UP)))
> goto drop;
>
> - if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
> + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(skb)) {
> res = netif_rx(skb);
> goto unlock;
> }
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2315c088e91d..82bd297921c1 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -6030,6 +6030,10 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
> ipvs_reset(skb);
> skb->mark = 0;
> skb_clear_tstamp(skb);
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + /* hand back GRO control to next netns */
> + skb->gro_disabled = 0;
> +#endif
> }
> EXPORT_SYMBOL_GPL(skb_scrub_packet);
Thanks,
Olek
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 9:11 ` Alexander Lobakin
@ 2024-06-21 15:40 ` Yan Zhai
0 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 15:40 UTC (permalink / raw)
To: Alexander Lobakin
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, David Ahern, Richard Gobert, Antoine Tenart,
Felix Fietkau, Soheil Hassas Yeganeh, Pavel Begunkov,
Lorenzo Bianconi, Thomas Weißschuh, linux-kernel, bpf
On Fri, Jun 21, 2024 at 4:13 AM Alexander Lobakin
<aleksander.lobakin@intel.com> wrote:
>
> From: Yan Zhai <yan@cloudflare.com>
> Date: Thu, 20 Jun 2024 15:19:10 -0700
>
> > Software GRO is currently controlled by a single switch, i.e.
> >
> > ethtool -K dev gro on|off
> >
> > However, this is not always desired. When GRO is enabled, even if the
> > kernel cannot GRO certain traffic, it has to run through the GRO receive
> > handlers with no benefit.
> >
> > There are also scenarios that turning off GRO is a requirement. For
> > example, our production environment has a scenario that a TC egress hook
> > may add multiple encapsulation headers to forwarded skbs for load
> > balancing and isolation purpose. The encapsulation is implemented via
> > BPF. But the problem arises then: there is no way to properly offload a
> > double-encapsulated packet, since skb only has network_header and
> > inner_network_header to track one layer of encapsulation, but not two.
>
> Implement it in the kernel then? :D
>
It would be a big commitment that I dare not make :) Out of curiosity,
is it something that devices can handle today?
> > On the other hand, not all the traffic through this device needs double
> > encapsulation. But we have to turn off GRO completely for any ingress
> > device as a result.
> >
> > Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> > this skb, rather than having to be 0-or-1 for all traffic.
> >
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > ---
> > include/linux/netdevice.h | 9 +++++++--
> > include/linux/skbuff.h | 10 ++++++++++
> > net/Kconfig | 10 ++++++++++
> > net/core/gro.c | 2 +-
> > net/core/gro_cells.c | 2 +-
> > net/core/skbuff.c | 4 ++++
> > 6 files changed, 33 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index c83b390191d4..2ca0870b1221 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -2415,11 +2415,16 @@ struct net_device {
> > ((dev)->devlink_port = (port)); \
> > })
> >
> > -static inline bool netif_elide_gro(const struct net_device *dev)
> > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > {
> > - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > return true;
> > +
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > + return skb->gro_disabled;
> > +#else
> > return false;
> > +#endif
> > }
> >
> > #define NETDEV_ALIGN 32
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index f4cda3fbdb75..48b10ece95b5 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -1008,6 +1008,9 @@ struct sk_buff {
> > #if IS_ENABLED(CONFIG_IP_SCTP)
> > __u8 csum_not_inet:1;
> > #endif
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > + __u8 gro_disabled:1;
> > +#endif
> >
> > #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
> > __u16 tc_index; /* traffic control index */
> > @@ -1215,6 +1218,13 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
> > #endif
> > }
> >
> > +static inline void skb_disable_gro(struct sk_buff *skb)
> > +{
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > + skb->gro_disabled = 1;
> > +#endif
> > +}
> > +
> > /**
> > * skb_unref - decrement the skb's reference count
> > * @skb: buffer
> > diff --git a/net/Kconfig b/net/Kconfig
> > index 9fe65fa26e48..47d1ee92df15 100644
> > --- a/net/Kconfig
> > +++ b/net/Kconfig
> > @@ -289,6 +289,16 @@ config MAX_SKB_FRAGS
> > and in drivers using build_skb().
> > If unsure, say 17.
> >
> > +config SKB_GRO_CONTROL
> > + bool "allow disable GRO on per-packet basis"
> > + default y
> > + help
> > + By default GRO can only be enabled or disabled per network device.
> > + This can be cumbersome for certain scenarios.
> > + Toggling this option will allow disabling GRO for selected packets,
> > + e.g. by XDP programs, so that it is more flexibile.
> > + Extra overhead should be minimal.
>
> I don't think we need a Kconfig option for that. Can't it be
> unconditional? Is there any real eye-visible overhead?
Normally if it is a single branch I would not worry about it. But I
know I am touching a hot potato here so I just want to be cautious :)
best
Yan
>
> Thanks,
> Olek
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-20 22:19 ` [RFC net-next 1/9] skb: introduce gro_disabled bit Yan Zhai
2024-06-21 9:11 ` Alexander Lobakin
@ 2024-06-21 9:49 ` Paolo Abeni
2024-06-21 14:29 ` Yan Zhai
2024-06-21 9:57 ` Paolo Abeni
2024-06-21 12:15 ` Willem de Bruijn
3 siblings, 1 reply; 33+ messages in thread
From: Paolo Abeni @ 2024-06-21 9:49 UTC (permalink / raw)
To: Yan Zhai, netdev
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Willem de Bruijn, Simon Horman, Florian Westphal, Mina Almasry,
Abhishek Chauhan, David Howells, Alexander Lobakin, David Ahern,
Richard Gobert, Antoine Tenart, Felix Fietkau,
Soheil Hassas Yeganeh, Pavel Begunkov, Lorenzo Bianconi,
Thomas Weißschuh, linux-kernel, bpf
On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> Software GRO is currently controlled by a single switch, i.e.
>
> ethtool -K dev gro on|off
>
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
>
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.
> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.
>
> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> this skb, rather than having to be 0-or-1 for all traffic.
>
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
> include/linux/netdevice.h | 9 +++++++--
> include/linux/skbuff.h | 10 ++++++++++
> net/Kconfig | 10 ++++++++++
> net/core/gro.c | 2 +-
> net/core/gro_cells.c | 2 +-
> net/core/skbuff.c | 4 ++++
> 6 files changed, 33 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c83b390191d4..2ca0870b1221 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2415,11 +2415,16 @@ struct net_device {
> ((dev)->devlink_port = (port)); \
> })
>
> -static inline bool netif_elide_gro(const struct net_device *dev)
> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> {
> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> return true;
> +
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + return skb->gro_disabled;
> +#else
> return false;
> +#endif
This will generate OoO packets if gro_disabled is flipped in the middle
of a stream.
Assuming the above is fine for your use case (I think it's _not_ in
general), you could get the same result without an additional costly
bit in sk_buff.
Let xdp_frame_fixup_skb_offloading() return a bool - e.g. 'true' when
gro should be avoided - and let the NIC driver call netif_receive_skb()
instead of the gro rx hook for such packets.
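Roughly something like the sketch below (untested; the boolean return
and the call site are assumptions layered on top of the helper from your
later patch, not existing code):

/* Hypothetical driver RX delivery path, assuming
 * xdp_frame_fixup_skb_offloading() (introduced later in this series) is
 * changed to return true when the XDP program asked to skip GRO.
 */
static void drv_napi_deliver(struct napi_struct *napi,
			     struct xdp_frame *xdpf, struct sk_buff *skb)
{
	bool skip_gro = xdp_frame_fixup_skb_offloading(xdpf, skb);

	if (skip_gro)
		netif_receive_skb(skb);		/* bypass the GRO engine */
	else
		napi_gro_receive(napi, skb);
}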
All in all, the approach implemented in this series does not look
worthwhile to me.
Thanks,
Paolo
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 9:49 ` Paolo Abeni
@ 2024-06-21 14:29 ` Yan Zhai
0 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 14:29 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Willem de Bruijn, Simon Horman, Florian Westphal,
Mina Almasry, Abhishek Chauhan, David Howells, Alexander Lobakin,
David Ahern, Richard Gobert, Antoine Tenart, Felix Fietkau,
Soheil Hassas Yeganeh, Pavel Begunkov, Lorenzo Bianconi,
Thomas Weißschuh, linux-kernel, bpf
On Fri, Jun 21, 2024 at 4:49 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> > Software GRO is currently controlled by a single switch, i.e.
> >
> > ethtool -K dev gro on|off
> >
> > However, this is not always desired. When GRO is enabled, even if the
> > kernel cannot GRO certain traffic, it has to run through the GRO receive
> > handlers with no benefit.
> >
> > There are also scenarios that turning off GRO is a requirement. For
> > example, our production environment has a scenario that a TC egress hook
> > may add multiple encapsulation headers to forwarded skbs for load
> > balancing and isolation purpose. The encapsulation is implemented via
> > BPF. But the problem arises then: there is no way to properly offload a
> > double-encapsulated packet, since skb only has network_header and
> > inner_network_header to track one layer of encapsulation, but not two.
> > On the other hand, not all the traffic through this device needs double
> > encapsulation. But we have to turn off GRO completely for any ingress
> > device as a result.
> >
> > Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> > this skb, rather than having to be 0-or-1 for all traffic.
> >
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > ---
> > include/linux/netdevice.h | 9 +++++++--
> > include/linux/skbuff.h | 10 ++++++++++
> > net/Kconfig | 10 ++++++++++
> > net/core/gro.c | 2 +-
> > net/core/gro_cells.c | 2 +-
> > net/core/skbuff.c | 4 ++++
> > 6 files changed, 33 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index c83b390191d4..2ca0870b1221 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -2415,11 +2415,16 @@ struct net_device {
> > ((dev)->devlink_port = (port)); \
> > })
> >
> > -static inline bool netif_elide_gro(const struct net_device *dev)
> > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > {
> > - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > return true;
> > +
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > + return skb->gro_disabled;
> > +#else
> > return false;
> > +#endif
>
> This will generate OoO if the gro_disabled is flipped in the middle of
> a stream.
>
> Assuming the above is fine for your use case (I think it's _not_ in
> general), you could get the same result without an additional costly
> bit in sk_buff.
Calling it per-packet control seems inaccurate here; the motivation is
to give users the ability to control per-flow behavior. OoO is indeed a
consequence if users don't do it correctly.
>
> Let xdp_frame_fixup_skb_offloading() return a bool - e.g. 'true' when
> gro should be avoided - and let the NIC driver call netif_receive_skb()
> instead of the gro rx hook for such packet.
>
For rx on a single device, directly calling netif_receive_skb is
reasonable. For tunnel receivers it is kind of inconsistent IMHO. For
example, we terminate GRE tunnels in a netns, and it is necessary to
disable GRO on both the entering veth device and the GRE tunnel to shut
down GRO. That's why I'd hope to use a bit on the skb, to be consistent
within the same netns. Let me add a bit more context in another thread
to clarify why we think this is necessary.
best,
Yan
> All in all the approach implemented in this series does not look worthy
> to me.
>
> Thanks,
>
> Paolo
>
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-20 22:19 ` [RFC net-next 1/9] skb: introduce gro_disabled bit Yan Zhai
2024-06-21 9:11 ` Alexander Lobakin
2024-06-21 9:49 ` Paolo Abeni
@ 2024-06-21 9:57 ` Paolo Abeni
2024-06-21 15:17 ` Yan Zhai
2024-06-21 12:15 ` Willem de Bruijn
3 siblings, 1 reply; 33+ messages in thread
From: Paolo Abeni @ 2024-06-21 9:57 UTC (permalink / raw)
To: Yan Zhai, netdev
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Willem de Bruijn, Simon Horman, Florian Westphal, Mina Almasry,
Abhishek Chauhan, David Howells, Alexander Lobakin, David Ahern,
Richard Gobert, Antoine Tenart, Felix Fietkau,
Soheil Hassas Yeganeh, Pavel Begunkov, Lorenzo Bianconi,
Thomas Weißschuh, linux-kernel, bpf
On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> Software GRO is currently controlled by a single switch, i.e.
>
> ethtool -K dev gro on|off
>
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
>
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.
> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.
Could you please add more details WRT this last statement? I'm unsure
whether I understand your problem. My guess is as follows:
Your device receives some traffic, GROs and forwards it, and the
multiple encapsulation can happen on such forwarded traffic (since I can
find almost none of the above in your message, this is mainly a wild
guess).
Assuming I guessed correctly, I think you could solve the problem with
no kernel changes: redirect the to-be-tunneled traffic to some virtual
device, enable all TX offloads on top of it, and let the encap happen
there.
Cheers,
Paolo
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 9:57 ` Paolo Abeni
@ 2024-06-21 15:17 ` Yan Zhai
0 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 15:17 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Willem de Bruijn, Simon Horman, Florian Westphal,
Mina Almasry, Abhishek Chauhan, David Howells, Alexander Lobakin,
David Ahern, Richard Gobert, Antoine Tenart, Felix Fietkau,
Soheil Hassas Yeganeh, Pavel Begunkov, Lorenzo Bianconi,
Thomas Weißschuh, linux-kernel, bpf
On Fri, Jun 21, 2024 at 4:57 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> > Software GRO is currently controlled by a single switch, i.e.
> >
> > ethtool -K dev gro on|off
> >
> > However, this is not always desired. When GRO is enabled, even if the
> > kernel cannot GRO certain traffic, it has to run through the GRO receive
> > handlers with no benefit.
> >
> > There are also scenarios that turning off GRO is a requirement. For
> > example, our production environment has a scenario that a TC egress hook
> > may add multiple encapsulation headers to forwarded skbs for load
> > balancing and isolation purpose. The encapsulation is implemented via
> > BPF. But the problem arises then: there is no way to properly offload a
> > double-encapsulated packet, since skb only has network_header and
> > inner_network_header to track one layer of encapsulation, but not two.
> > On the other hand, not all the traffic through this device needs double
> > encapsulation. But we have to turn off GRO completely for any ingress
> > device as a result.
>
> Could you please add more details WRT this last statement? I'm unsure
> if I understand your problem. My guess is as follow:
>
> Your device receive some traffic, GRO and forward it, and the multiple
> encapsulation can happen on such forwarded traffic (since I can't find
> almost none of the above your message is mainly a wild guess).
>
> Assuming I guessed correctly, I think you could solve the problem with
> no kernel changes: redirect the to-be-tunneled traffic to some virtual
> device and all TX offload on top of it and let the encap happen there.
>
Let's say we have a netns that implements network functions like
DoS/IDS/load balancing for IP traffic. The netns has a single veth
entrance/exit, and a bunch of IP tunnels (GRE/XFRM) to receive and
tunnel traffic from customers' private sites. Some of this traffic may
be encapsulated again to reach services outside of the netns (but on
the same server), for example when customers also want to use our
CDN/caching functionality. The complication here is that we might have
to further tunnel traffic to another data center, because routing is
asymmetric: we can receive client traffic from the US but the response
may come back to our EU data center, and in order to do layer 4/layer 7
service, we have to make sure both land on the same server.
It is true that a device like a veth pair or even netkit could let the
kernel segment GRO'ed packets for us. But this does not actually sound
right in terms of design: if we already know some packet path should
not be GRO'ed, can we enforce that rather than aggregating packets only
to chop them up again soon after? For our specific case it also becomes
a headache for analytics and customer rules that rely on the ingress
device name: we would probably need to pair each tunnel with such a
virtual device. There could be hundreds of IPsec tunnels, and that
seems like a substantial overhead for both the data path and control
plane management.
To make this a bit more general, what I'd like to introduce here is:
when we know GRO is either problematic or simply not useful (e.g. for
some UDP traffic), can we have a finer-grained control to skip it?
thanks
Yan
> Cheers,
>
> Paolo
>
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-20 22:19 ` [RFC net-next 1/9] skb: introduce gro_disabled bit Yan Zhai
` (2 preceding siblings ...)
2024-06-21 9:57 ` Paolo Abeni
@ 2024-06-21 12:15 ` Willem de Bruijn
2024-06-21 12:47 ` Daniel Borkmann
2024-06-21 15:34 ` Yan Zhai
3 siblings, 2 replies; 33+ messages in thread
From: Willem de Bruijn @ 2024-06-21 12:15 UTC (permalink / raw)
To: Yan Zhai, netdev
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Willem de Bruijn, Simon Horman, Florian Westphal,
Mina Almasry, Abhishek Chauhan, David Howells, Alexander Lobakin,
David Ahern, Richard Gobert, Antoine Tenart, Yan Zhai,
Felix Fietkau, Soheil Hassas Yeganeh, Pavel Begunkov,
Lorenzo Bianconi, Thomas Weißschuh, netdev, linux-kernel,
bpf
Yan Zhai wrote:
> Software GRO is currently controlled by a single switch, i.e.
>
> ethtool -K dev gro on|off
>
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
>
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.
> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.
>
> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> this skb, rather than having to be 0-or-1 for all traffic.
>
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
> include/linux/netdevice.h | 9 +++++++--
> include/linux/skbuff.h | 10 ++++++++++
> net/Kconfig | 10 ++++++++++
> net/core/gro.c | 2 +-
> net/core/gro_cells.c | 2 +-
> net/core/skbuff.c | 4 ++++
> 6 files changed, 33 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c83b390191d4..2ca0870b1221 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2415,11 +2415,16 @@ struct net_device {
> ((dev)->devlink_port = (port)); \
> })
>
> -static inline bool netif_elide_gro(const struct net_device *dev)
> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> {
> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> return true;
> +
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + return skb->gro_disabled;
> +#else
> return false;
> +#endif
Yet more branches in the hot path.
Compile time configurability does not help, as that will be
enabled by distros.
For a fairly niche use case, where the functionality of GRO already
works. So just a performance gain for a very rare case, at the cost of
a regression in the common case. A small regression perhaps, but death
by a thousand cuts.
> }
>
> #define NETDEV_ALIGN 32
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index f4cda3fbdb75..48b10ece95b5 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1008,6 +1008,9 @@ struct sk_buff {
> #if IS_ENABLED(CONFIG_IP_SCTP)
> __u8 csum_not_inet:1;
> #endif
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + __u8 gro_disabled:1;
> +#endif
>
> #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
> __u16 tc_index; /* traffic control index */
> @@ -1215,6 +1218,13 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
> #endif
> }
>
> +static inline void skb_disable_gro(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + skb->gro_disabled = 1;
> +#endif
> +}
> +
> /**
> * skb_unref - decrement the skb's reference count
> * @skb: buffer
> diff --git a/net/Kconfig b/net/Kconfig
> index 9fe65fa26e48..47d1ee92df15 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -289,6 +289,16 @@ config MAX_SKB_FRAGS
> and in drivers using build_skb().
> If unsure, say 17.
>
> +config SKB_GRO_CONTROL
> + bool "allow disable GRO on per-packet basis"
> + default y
> + help
> + By default GRO can only be enabled or disabled per network device.
> + This can be cumbersome for certain scenarios.
> + Toggling this option will allow disabling GRO for selected packets,
> + e.g. by XDP programs, so that it is more flexibile.
> + Extra overhead should be minimal.
> +
> config RPS
> bool "Receive packet steering"
> depends on SMP && SYSFS
> diff --git a/net/core/gro.c b/net/core/gro.c
> index b3b43de1a650..46232a0d1983 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -476,7 +476,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> enum gro_result ret;
> int same_flow;
>
> - if (netif_elide_gro(skb->dev))
> + if (netif_elide_gro(skb))
> goto normal;
>
> gro_list_prepare(&gro_list->list, skb);
> diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
> index ff8e5b64bf6b..1bf15783300f 100644
> --- a/net/core/gro_cells.c
> +++ b/net/core/gro_cells.c
> @@ -20,7 +20,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
> if (unlikely(!(dev->flags & IFF_UP)))
> goto drop;
>
> - if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
> + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(skb)) {
> res = netif_rx(skb);
> goto unlock;
> }
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2315c088e91d..82bd297921c1 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -6030,6 +6030,10 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
> ipvs_reset(skb);
> skb->mark = 0;
> skb_clear_tstamp(skb);
> +#ifdef CONFIG_SKB_GRO_CONTROL
> + /* hand back GRO control to next netns */
> + skb->gro_disabled = 0;
> +#endif
> }
> EXPORT_SYMBOL_GPL(skb_scrub_packet);
>
> --
> 2.30.2
>
>
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 12:15 ` Willem de Bruijn
@ 2024-06-21 12:47 ` Daniel Borkmann
2024-06-21 16:00 ` Yan Zhai
2024-06-21 15:34 ` Yan Zhai
1 sibling, 1 reply; 33+ messages in thread
From: Daniel Borkmann @ 2024-06-21 12:47 UTC (permalink / raw)
To: Willem de Bruijn, Yan Zhai, netdev
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Jesper Dangaard Brouer, John Fastabend,
Willem de Bruijn, Simon Horman, Florian Westphal, Mina Almasry,
Abhishek Chauhan, David Howells, Alexander Lobakin, David Ahern,
Richard Gobert, Antoine Tenart, Felix Fietkau,
Soheil Hassas Yeganeh, Pavel Begunkov, Lorenzo Bianconi,
Thomas Weißschuh, linux-kernel, bpf
On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> Yan Zhai wrote:
>> Software GRO is currently controlled by a single switch, i.e.
>>
>> ethtool -K dev gro on|off
>>
>> However, this is not always desired. When GRO is enabled, even if the
>> kernel cannot GRO certain traffic, it has to run through the GRO receive
>> handlers with no benefit.
>>
>> There are also scenarios that turning off GRO is a requirement. For
>> example, our production environment has a scenario that a TC egress hook
>> may add multiple encapsulation headers to forwarded skbs for load
>> balancing and isolation purpose. The encapsulation is implemented via
>> BPF. But the problem arises then: there is no way to properly offload a
>> double-encapsulated packet, since skb only has network_header and
>> inner_network_header to track one layer of encapsulation, but not two.
>> On the other hand, not all the traffic through this device needs double
>> encapsulation. But we have to turn off GRO completely for any ingress
>> device as a result.
>>
>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
>> this skb, rather than having to be 0-or-1 for all traffic.
>>
>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
>> ---
>> include/linux/netdevice.h | 9 +++++++--
>> include/linux/skbuff.h | 10 ++++++++++
>> net/Kconfig | 10 ++++++++++
>> net/core/gro.c | 2 +-
>> net/core/gro_cells.c | 2 +-
>> net/core/skbuff.c | 4 ++++
>> 6 files changed, 33 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index c83b390191d4..2ca0870b1221 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -2415,11 +2415,16 @@ struct net_device {
>> ((dev)->devlink_port = (port)); \
>> })
>>
>> -static inline bool netif_elide_gro(const struct net_device *dev)
>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>> {
>> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
>> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>> return true;
>> +
>> +#ifdef CONFIG_SKB_GRO_CONTROL
>> + return skb->gro_disabled;
>> +#else
>> return false;
>> +#endif
>
> Yet more branches in the hot path.
>
> Compile time configurability does not help, as that will be
> enabled by distros.
>
> For a fairly niche use case. Where functionality of GRO already
> works. So just a performance for a very rare case at the cost of a
> regression in the common case. A small regression perhaps, but death
> by a thousand cuts.
Mentioning it here b/c it perhaps fits in this context: a longer time
ago the idea was mentioned of having BPF operate as the GRO engine,
which might also help reduce the attack surface by only having to handle
packets of interest for the concrete production use case. Perhaps here
the metadata buffer could be used to pass a notification from XDP to
exit early w/o aggregation.
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 12:47 ` Daniel Borkmann
@ 2024-06-21 16:00 ` Yan Zhai
2024-06-21 16:15 ` Daniel Borkmann
0 siblings, 1 reply; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 16:00 UTC (permalink / raw)
To: Daniel Borkmann
Cc: Willem de Bruijn, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, Alexander Lobakin, David Ahern, Richard Gobert,
Antoine Tenart, Felix Fietkau, Soheil Hassas Yeganeh,
Pavel Begunkov, Lorenzo Bianconi, Thomas Weißschuh,
linux-kernel, bpf
On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> > Yan Zhai wrote:
> >> Software GRO is currently controlled by a single switch, i.e.
> >>
> >> ethtool -K dev gro on|off
> >>
> >> However, this is not always desired. When GRO is enabled, even if the
> >> kernel cannot GRO certain traffic, it has to run through the GRO receive
> >> handlers with no benefit.
> >>
> >> There are also scenarios that turning off GRO is a requirement. For
> >> example, our production environment has a scenario that a TC egress hook
> >> may add multiple encapsulation headers to forwarded skbs for load
> >> balancing and isolation purpose. The encapsulation is implemented via
> >> BPF. But the problem arises then: there is no way to properly offload a
> >> double-encapsulated packet, since skb only has network_header and
> >> inner_network_header to track one layer of encapsulation, but not two.
> >> On the other hand, not all the traffic through this device needs double
> >> encapsulation. But we have to turn off GRO completely for any ingress
> >> device as a result.
> >>
> >> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> >> this skb, rather than having to be 0-or-1 for all traffic.
> >>
> >> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> >> ---
> >> include/linux/netdevice.h | 9 +++++++--
> >> include/linux/skbuff.h | 10 ++++++++++
> >> net/Kconfig | 10 ++++++++++
> >> net/core/gro.c | 2 +-
> >> net/core/gro_cells.c | 2 +-
> >> net/core/skbuff.c | 4 ++++
> >> 6 files changed, 33 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >> index c83b390191d4..2ca0870b1221 100644
> >> --- a/include/linux/netdevice.h
> >> +++ b/include/linux/netdevice.h
> >> @@ -2415,11 +2415,16 @@ struct net_device {
> >> ((dev)->devlink_port = (port)); \
> >> })
> >>
> >> -static inline bool netif_elide_gro(const struct net_device *dev)
> >> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >> {
> >> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> >> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >> return true;
> >> +
> >> +#ifdef CONFIG_SKB_GRO_CONTROL
> >> + return skb->gro_disabled;
> >> +#else
> >> return false;
> >> +#endif
> >
> > Yet more branches in the hot path.
> >
> > Compile time configurability does not help, as that will be
> > enabled by distros.
> >
> > For a fairly niche use case. Where functionality of GRO already
> > works. So just a performance for a very rare case at the cost of a
> > regression in the common case. A small regression perhaps, but death
> > by a thousand cuts.
>
> Mentioning it here b/c it perhaps fits in this context, longer time ago
> there was the idea mentioned to have BPF operating as GRO engine which
> might also help to reduce attack surface by only having to handle packets
> of interest for the concrete production use case. Perhaps here meta data
> buffer could be used to pass a notification from XDP to exit early w/o
> aggregation.
Metadata is in fact one of our interests as well. We discussed using
metadata instead of an skb bit to carry this information internally.
Since metadata is opaque atm, it seems the only option is to have a GRO
control hook before napi_gro_receive and let BPF decide between
netif_receive_skb and napi_gro_receive (echoing what Paolo said). With
BPF it could indeed be more flexible, but the downside is that it could
be even slower than taking a bit on the skb. I am actually open to
either approach, as long as it gives us more control over when to
enable GRO :)
To extend the discussion a bit, putting GRO aside, I think some common
hook before GRO would still be valuable moving forward: it is a limited
window where the driver code has access to both the XDP context and the
skb. Today we do not have a good way to transfer HW offload info to
skbs when XDP redirects to another CPU or encapsulates-and-transmits
for load balancing purposes. The XDP metadata infrastructure already
allows XDP to read this information where the driver supports it, so to
complete that, a place to use it (which I introduced as
xdp_buff/frame_fixup_skb_offloading in a later patch) would be
beneficial, to pass on things like the flow hash, VLAN information,
etc.
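For illustration, a rough sketch of what such a fixup could do on the
skb side (the metadata layout below is made up purely for this example
and is not something the series defines):

/* Purely illustrative metadata layout that an XDP program could prepend
 * via bpf_xdp_adjust_meta(); hypothetical, not part of this series.
 */
struct rx_meta {
	u32 hash;
	u16 vlan_tci;
	u16 flags;
#define RX_META_F_NO_GRO	BIT(0)
};

static void skb_fixup_from_xdp_meta(struct sk_buff *skb,
				    const struct rx_meta *meta)
{
	/* Carry the XDP/HW-provided flow hash and VLAN tag over to the skb. */
	skb_set_hash(skb, meta->hash, PKT_HASH_TYPE_L4);
	if (meta->vlan_tci)
		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
				       meta->vlan_tci);
	/* And honor a per-packet request to skip GRO (helper from this patch). */
	if (meta->flags & RX_META_F_NO_GRO)
		skb_disable_gro(skb);
}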
best
Yan
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 16:00 ` Yan Zhai
@ 2024-06-21 16:15 ` Daniel Borkmann
2024-06-21 17:20 ` Yan Zhai
0 siblings, 1 reply; 33+ messages in thread
From: Daniel Borkmann @ 2024-06-21 16:15 UTC (permalink / raw)
To: Yan Zhai
Cc: Willem de Bruijn, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, Alexander Lobakin, David Ahern, Richard Gobert,
Antoine Tenart, Felix Fietkau, Soheil Hassas Yeganeh,
Pavel Begunkov, Lorenzo Bianconi, Thomas Weißschuh,
linux-kernel, bpf
On 6/21/24 6:00 PM, Yan Zhai wrote:
> On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
>>> Yan Zhai wrote:
>>>> Software GRO is currently controlled by a single switch, i.e.
>>>>
>>>> ethtool -K dev gro on|off
>>>>
>>>> However, this is not always desired. When GRO is enabled, even if the
>>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
>>>> handlers with no benefit.
>>>>
>>>> There are also scenarios that turning off GRO is a requirement. For
>>>> example, our production environment has a scenario that a TC egress hook
>>>> may add multiple encapsulation headers to forwarded skbs for load
>>>> balancing and isolation purpose. The encapsulation is implemented via
>>>> BPF. But the problem arises then: there is no way to properly offload a
>>>> double-encapsulated packet, since skb only has network_header and
>>>> inner_network_header to track one layer of encapsulation, but not two.
>>>> On the other hand, not all the traffic through this device needs double
>>>> encapsulation. But we have to turn off GRO completely for any ingress
>>>> device as a result.
>>>>
>>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
>>>> this skb, rather than having to be 0-or-1 for all traffic.
>>>>
>>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
>>>> ---
>>>> include/linux/netdevice.h | 9 +++++++--
>>>> include/linux/skbuff.h | 10 ++++++++++
>>>> net/Kconfig | 10 ++++++++++
>>>> net/core/gro.c | 2 +-
>>>> net/core/gro_cells.c | 2 +-
>>>> net/core/skbuff.c | 4 ++++
>>>> 6 files changed, 33 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>> index c83b390191d4..2ca0870b1221 100644
>>>> --- a/include/linux/netdevice.h
>>>> +++ b/include/linux/netdevice.h
>>>> @@ -2415,11 +2415,16 @@ struct net_device {
>>>> ((dev)->devlink_port = (port)); \
>>>> })
>>>>
>>>> -static inline bool netif_elide_gro(const struct net_device *dev)
>>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>>>> {
>>>> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
>>>> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>>>> return true;
>>>> +
>>>> +#ifdef CONFIG_SKB_GRO_CONTROL
>>>> + return skb->gro_disabled;
>>>> +#else
>>>> return false;
>>>> +#endif
>>>
>>> Yet more branches in the hot path.
>>>
>>> Compile time configurability does not help, as that will be
>>> enabled by distros.
>>>
>>> For a fairly niche use case. Where functionality of GRO already
>>> works. So just a performance for a very rare case at the cost of a
>>> regression in the common case. A small regression perhaps, but death
>>> by a thousand cuts.
>>
>> Mentioning it here b/c it perhaps fits in this context, longer time ago
>> there was the idea mentioned to have BPF operating as GRO engine which
>> might also help to reduce attack surface by only having to handle packets
>> of interest for the concrete production use case. Perhaps here meta data
>> buffer could be used to pass a notification from XDP to exit early w/o
>> aggregation.
>
> Metadata is in fact one of our interests as well. We discussed using
> metadata instead of a skb bit to carry this information internally.
> Since metadata is opaque atm so it seems the only option is to have a
> GRO control hook before napi_gro_receive, and let BPF decide
> netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> it could indeed be more flexible, but the cons is that it could be
> even more slower than taking a bit on skb. I am actually open to
> either approach, as long as it gives us more control on when to enable
> GRO :)
Oh wait, one thing that just came to mind... have you tried a u64
per-CPU counter map in XDP? For packets which should not be
GRO-aggregated you write count++ into the metadata area, and this
forces GRO to not aggregate, since the metadata that needs to be
transported to the tc BPF layer mismatches (and therefore the
contract/intent is that tc BPF needs to see the different metadata
passed to it).
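Rough sketch of the XDP side (untested; map and program names are just
examples):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Per-CPU counter written into the XDP metadata area for packets that
 * should not be aggregated: consecutive packets then carry different
 * metadata, so skb_metadata_differs() keeps GRO from coalescing them.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} nogro_cnt SEC(".maps");

SEC("xdp")
int xdp_mark_no_gro(struct xdp_md *ctx)
{
	__u32 key = 0;
	__u64 *cnt, *val;
	void *data, *meta;

	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(__u64)))
		return XDP_PASS;

	data = (void *)(long)ctx->data;
	meta = (void *)(long)ctx->data_meta;
	val = meta;
	if ((void *)(val + 1) > data)
		return XDP_PASS;

	cnt = bpf_map_lookup_elem(&nogro_cnt, &key);
	if (!cnt)
		return XDP_PASS;

	*val = (*cnt)++;	/* unique value per packet on this CPU */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";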
Thanks,
Daniel
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 16:15 ` Daniel Borkmann
@ 2024-06-21 17:20 ` Yan Zhai
2024-06-23 8:23 ` Willem de Bruijn
0 siblings, 1 reply; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 17:20 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: netdev, linux-kernel, bpf, kernel-team
On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/21/24 6:00 PM, Yan Zhai wrote:
> > On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> >>> Yan Zhai wrote:
> >>>> Software GRO is currently controlled by a single switch, i.e.
> >>>>
> >>>> ethtool -K dev gro on|off
> >>>>
> >>>> However, this is not always desired. When GRO is enabled, even if the
> >>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
> >>>> handlers with no benefit.
> >>>>
> >>>> There are also scenarios that turning off GRO is a requirement. For
> >>>> example, our production environment has a scenario that a TC egress hook
> >>>> may add multiple encapsulation headers to forwarded skbs for load
> >>>> balancing and isolation purpose. The encapsulation is implemented via
> >>>> BPF. But the problem arises then: there is no way to properly offload a
> >>>> double-encapsulated packet, since skb only has network_header and
> >>>> inner_network_header to track one layer of encapsulation, but not two.
> >>>> On the other hand, not all the traffic through this device needs double
> >>>> encapsulation. But we have to turn off GRO completely for any ingress
> >>>> device as a result.
> >>>>
> >>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> >>>> this skb, rather than having to be 0-or-1 for all traffic.
> >>>>
> >>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> >>>> ---
> >>>> include/linux/netdevice.h | 9 +++++++--
> >>>> include/linux/skbuff.h | 10 ++++++++++
> >>>> net/Kconfig | 10 ++++++++++
> >>>> net/core/gro.c | 2 +-
> >>>> net/core/gro_cells.c | 2 +-
> >>>> net/core/skbuff.c | 4 ++++
> >>>> 6 files changed, 33 insertions(+), 4 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >>>> index c83b390191d4..2ca0870b1221 100644
> >>>> --- a/include/linux/netdevice.h
> >>>> +++ b/include/linux/netdevice.h
> >>>> @@ -2415,11 +2415,16 @@ struct net_device {
> >>>> ((dev)->devlink_port = (port)); \
> >>>> })
> >>>>
> >>>> -static inline bool netif_elide_gro(const struct net_device *dev)
> >>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >>>> {
> >>>> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> >>>> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >>>> return true;
> >>>> +
> >>>> +#ifdef CONFIG_SKB_GRO_CONTROL
> >>>> + return skb->gro_disabled;
> >>>> +#else
> >>>> return false;
> >>>> +#endif
> >>>
> >>> Yet more branches in the hot path.
> >>>
> >>> Compile time configurability does not help, as that will be
> >>> enabled by distros.
> >>>
> >>> For a fairly niche use case. Where functionality of GRO already
> >>> works. So just a performance for a very rare case at the cost of a
> >>> regression in the common case. A small regression perhaps, but death
> >>> by a thousand cuts.
> >>
> >> Mentioning it here b/c it perhaps fits in this context, longer time ago
> >> there was the idea mentioned to have BPF operating as GRO engine which
> >> might also help to reduce attack surface by only having to handle packets
> >> of interest for the concrete production use case. Perhaps here meta data
> >> buffer could be used to pass a notification from XDP to exit early w/o
> >> aggregation.
> >
> > Metadata is in fact one of our interests as well. We discussed using
> > metadata instead of a skb bit to carry this information internally.
> > Since metadata is opaque atm so it seems the only option is to have a
> > GRO control hook before napi_gro_receive, and let BPF decide
> > netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> > it could indeed be more flexible, but the cons is that it could be
> > even more slower than taking a bit on skb. I am actually open to
> > either approach, as long as it gives us more control on when to enable
> > GRO :)
>
> Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
> counter map in XDP? For packets which should not be GRO-aggregated you
> add count++ into the meta data area, and this forces GRO to not aggregate
> since meta data that needs to be transported to tc BPF layer mismatches
> (and therefore the contract/intent is that tc BPF needs to see the different
> meta data passed to it).
>
Very, very sorry for the double resend :( Not sure why my laptop also
decided to switch on HTML... I removed the CCs from the message,
hopefully that reduces some noise...
We did this before accidentally (we put a timestamp for debugging
purposes in the metadata) and it actually caused about 20% OoO for TCP
in production: all PSH packets were reordered. GRO does not deliver the
packet to the upper layer when a metadata diff is found for a non-PSH
packet; instead it is queued as a "new flow" on the GRO list and waits
for flushing. When a PSH packet arrives, its semantics are to flush
that packet immediately, so it precedes earlier packets of the same
flow.
This behavior can also increase latency and degrade the GRO hash
buckets, since the mismatch does not result in flushing; it results in
extra queuing on the same hash list until the list is flushed. It's
another reason we want to disable GRO when we know metadata can be set
differently for tracing purposes (I didn't mention this because it
seemed distracting).
Thanks
Yan
> Thanks,
> Daniel
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 17:20 ` Yan Zhai
@ 2024-06-23 8:23 ` Willem de Bruijn
2024-06-24 13:30 ` Daniel Borkmann
0 siblings, 1 reply; 33+ messages in thread
From: Willem de Bruijn @ 2024-06-23 8:23 UTC (permalink / raw)
To: Yan Zhai, Daniel Borkmann; +Cc: netdev, linux-kernel, bpf, kernel-team
Yan Zhai wrote:
> On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >
> > On 6/21/24 6:00 PM, Yan Zhai wrote:
> > > On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> > >> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> > >>> Yan Zhai wrote:
> > >>>> Software GRO is currently controlled by a single switch, i.e.
> > >>>>
> > >>>> ethtool -K dev gro on|off
> > >>>>
> > >>>> However, this is not always desired. When GRO is enabled, even if the
> > >>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
> > >>>> handlers with no benefit.
> > >>>>
> > >>>> There are also scenarios that turning off GRO is a requirement. For
> > >>>> example, our production environment has a scenario that a TC egress hook
> > >>>> may add multiple encapsulation headers to forwarded skbs for load
> > >>>> balancing and isolation purpose. The encapsulation is implemented via
> > >>>> BPF. But the problem arises then: there is no way to properly offload a
> > >>>> double-encapsulated packet, since skb only has network_header and
> > >>>> inner_network_header to track one layer of encapsulation, but not two.
> > >>>> On the other hand, not all the traffic through this device needs double
> > >>>> encapsulation. But we have to turn off GRO completely for any ingress
> > >>>> device as a result.
> > >>>>
> > >>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> > >>>> this skb, rather than having to be 0-or-1 for all traffic.
> > >>>>
> > >>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > >>>> ---
> > >>>> include/linux/netdevice.h | 9 +++++++--
> > >>>> include/linux/skbuff.h | 10 ++++++++++
> > >>>> net/Kconfig | 10 ++++++++++
> > >>>> net/core/gro.c | 2 +-
> > >>>> net/core/gro_cells.c | 2 +-
> > >>>> net/core/skbuff.c | 4 ++++
> > >>>> 6 files changed, 33 insertions(+), 4 deletions(-)
> > >>>>
> > >>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > >>>> index c83b390191d4..2ca0870b1221 100644
> > >>>> --- a/include/linux/netdevice.h
> > >>>> +++ b/include/linux/netdevice.h
> > >>>> @@ -2415,11 +2415,16 @@ struct net_device {
> > >>>> ((dev)->devlink_port = (port)); \
> > >>>> })
> > >>>>
> > >>>> -static inline bool netif_elide_gro(const struct net_device *dev)
> > >>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > >>>> {
> > >>>> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > >>>> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > >>>> return true;
> > >>>> +
> > >>>> +#ifdef CONFIG_SKB_GRO_CONTROL
> > >>>> + return skb->gro_disabled;
> > >>>> +#else
> > >>>> return false;
> > >>>> +#endif
> > >>>
> > >>> Yet more branches in the hot path.
> > >>>
> > >>> Compile time configurability does not help, as that will be
> > >>> enabled by distros.
> > >>>
> > >>> For a fairly niche use case. Where functionality of GRO already
> > >>> works. So just a performance for a very rare case at the cost of a
> > >>> regression in the common case. A small regression perhaps, but death
> > >>> by a thousand cuts.
> > >>
> > >> Mentioning it here b/c it perhaps fits in this context, longer time ago
> > >> there was the idea mentioned to have BPF operating as GRO engine which
> > >> might also help to reduce attack surface by only having to handle packets
> > >> of interest for the concrete production use case. Perhaps here meta data
> > >> buffer could be used to pass a notification from XDP to exit early w/o
> > >> aggregation.
> > >
> > > Metadata is in fact one of our interests as well. We discussed using
> > > metadata instead of a skb bit to carry this information internally.
> > > Since metadata is opaque atm so it seems the only option is to have a
> > > GRO control hook before napi_gro_receive, and let BPF decide
> > > netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> > > it could indeed be more flexible, but the cons is that it could be
> > > even more slower than taking a bit on skb. I am actually open to
> > > either approach, as long as it gives us more control on when to enable
> > > GRO :)
> >
> > Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
> > counter map in XDP? For packets which should not be GRO-aggregated you
> > add count++ into the meta data area, and this forces GRO to not aggregate
> > since meta data that needs to be transported to tc BPF layer mismatches
> > (and therefore the contract/intent is that tc BPF needs to see the different
> > meta data passed to it).
> >
>
> We did this before accidentally (we put a timestamp for debugging
> purposes in metadata) and this actually caused about 20% of OoO for
> TCP in production: all PSH packets are reordered. GRO does not fire
> the packet to the upper layer when a diff in metadata is found for a
> non-PSH packet, instead it is queued as a “new flow” on the GRO list
> and waits for flushing. When a PSH packet arrives, its semantic is to
> flush this packet immediately and thus precedes earlier packets of the
> same flow.
Is that a bug in XDP metadata handling for GRO?
Mismatching metadata should not be taken as separate flows, but as a
flush condition.
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-23 8:23 ` Willem de Bruijn
@ 2024-06-24 13:30 ` Daniel Borkmann
2024-06-24 17:49 ` Yan Zhai
0 siblings, 1 reply; 33+ messages in thread
From: Daniel Borkmann @ 2024-06-24 13:30 UTC (permalink / raw)
To: Willem de Bruijn, Yan Zhai; +Cc: netdev, linux-kernel, bpf, kernel-team
On 6/23/24 10:23 AM, Willem de Bruijn wrote:
> Yan Zhai wrote:
>> On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>> On 6/21/24 6:00 PM, Yan Zhai wrote:
>>>> On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
>>>>>> Yan Zhai wrote:
>>>>>>> Software GRO is currently controlled by a single switch, i.e.
>>>>>>>
>>>>>>> ethtool -K dev gro on|off
>>>>>>>
>>>>>>> However, this is not always desired. When GRO is enabled, even if the
>>>>>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
>>>>>>> handlers with no benefit.
>>>>>>>
>>>>>>> There are also scenarios that turning off GRO is a requirement. For
>>>>>>> example, our production environment has a scenario that a TC egress hook
>>>>>>> may add multiple encapsulation headers to forwarded skbs for load
>>>>>>> balancing and isolation purpose. The encapsulation is implemented via
>>>>>>> BPF. But the problem arises then: there is no way to properly offload a
>>>>>>> double-encapsulated packet, since skb only has network_header and
>>>>>>> inner_network_header to track one layer of encapsulation, but not two.
>>>>>>> On the other hand, not all the traffic through this device needs double
>>>>>>> encapsulation. But we have to turn off GRO completely for any ingress
>>>>>>> device as a result.
>>>>>>>
>>>>>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
>>>>>>> this skb, rather than having to be 0-or-1 for all traffic.
>>>>>>>
>>>>>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
>>>>>>> ---
>>>>>>> include/linux/netdevice.h | 9 +++++++--
>>>>>>> include/linux/skbuff.h | 10 ++++++++++
>>>>>>> net/Kconfig | 10 ++++++++++
>>>>>>> net/core/gro.c | 2 +-
>>>>>>> net/core/gro_cells.c | 2 +-
>>>>>>> net/core/skbuff.c | 4 ++++
>>>>>>> 6 files changed, 33 insertions(+), 4 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>>>> index c83b390191d4..2ca0870b1221 100644
>>>>>>> --- a/include/linux/netdevice.h
>>>>>>> +++ b/include/linux/netdevice.h
>>>>>>> @@ -2415,11 +2415,16 @@ struct net_device {
>>>>>>> ((dev)->devlink_port = (port)); \
>>>>>>> })
>>>>>>>
>>>>>>> -static inline bool netif_elide_gro(const struct net_device *dev)
>>>>>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>>>>>>> {
>>>>>>> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
>>>>>>> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>>>>>>> return true;
>>>>>>> +
>>>>>>> +#ifdef CONFIG_SKB_GRO_CONTROL
>>>>>>> + return skb->gro_disabled;
>>>>>>> +#else
>>>>>>> return false;
>>>>>>> +#endif
>>>>>>
>>>>>> Yet more branches in the hot path.
>>>>>>
>>>>>> Compile time configurability does not help, as that will be
>>>>>> enabled by distros.
>>>>>>
>>>>>> For a fairly niche use case. Where functionality of GRO already
>>>>>> works. So just a performance for a very rare case at the cost of a
>>>>>> regression in the common case. A small regression perhaps, but death
>>>>>> by a thousand cuts.
>>>>>
>>>>> Mentioning it here b/c it perhaps fits in this context, longer time ago
>>>>> there was the idea mentioned to have BPF operating as GRO engine which
>>>>> might also help to reduce attack surface by only having to handle packets
>>>>> of interest for the concrete production use case. Perhaps here meta data
>>>>> buffer could be used to pass a notification from XDP to exit early w/o
>>>>> aggregation.
>>>>
>>>> Metadata is in fact one of our interests as well. We discussed using
>>>> metadata instead of a skb bit to carry this information internally.
>>>> Since metadata is opaque atm so it seems the only option is to have a
>>>> GRO control hook before napi_gro_receive, and let BPF decide
>>>> netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
>>>> it could indeed be more flexible, but the cons is that it could be
>>>> even more slower than taking a bit on skb. I am actually open to
>>>> either approach, as long as it gives us more control on when to enable
>>>> GRO :)
>>>
>>> Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
>>> counter map in XDP? For packets which should not be GRO-aggregated you
>>> add count++ into the meta data area, and this forces GRO to not aggregate
>>> since meta data that needs to be transported to tc BPF layer mismatches
>>> (and therefore the contract/intent is that tc BPF needs to see the different
>>> meta data passed to it).
>>
>> We did this before accidentally (we put a timestamp for debugging
>> purposes in metadata) and this actually caused about 20% of OoO for
>> TCP in production: all PSH packets are reordered. GRO does not fire
>> the packet to the upper layer when a diff in metadata is found for a
>> non-PSH packet, instead it is queued as a “new flow” on the GRO list
>> and waits for flushing. When a PSH packet arrives, its semantic is to
>> flush this packet immediately and thus precedes earlier packets of the
>> same flow.
>
> Is that a bug in XDP metadata handling for GRO?
>
> Mismatching metadata should not be taken as separate flows, but as a
> flush condition.
Definitely a bug as it should flush. If no one is faster I can add it to my
backlog to fix it, but it might take a week before I get to it.
Thanks,
Daniel
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-24 13:30 ` Daniel Borkmann
@ 2024-06-24 17:49 ` Yan Zhai
0 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-24 17:49 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: Willem de Bruijn, netdev, linux-kernel, bpf, kernel-team
On Mon, Jun 24, 2024 at 8:30 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/23/24 10:23 AM, Willem de Bruijn wrote:
> > Yan Zhai wrote:
> >> On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>> On 6/21/24 6:00 PM, Yan Zhai wrote:
> >>>> On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> >>>>>> Yan Zhai wrote:
> >>>>>>> Software GRO is currently controlled by a single switch, i.e.
> >>>>>>>
> >>>>>>> ethtool -K dev gro on|off
> >>>>>>>
> >>>>>>> However, this is not always desired. When GRO is enabled, even if the
> >>>>>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
> >>>>>>> handlers with no benefit.
> >>>>>>>
> >>>>>>> There are also scenarios that turning off GRO is a requirement. For
> >>>>>>> example, our production environment has a scenario that a TC egress hook
> >>>>>>> may add multiple encapsulation headers to forwarded skbs for load
> >>>>>>> balancing and isolation purpose. The encapsulation is implemented via
> >>>>>>> BPF. But the problem arises then: there is no way to properly offload a
> >>>>>>> double-encapsulated packet, since skb only has network_header and
> >>>>>>> inner_network_header to track one layer of encapsulation, but not two.
> >>>>>>> On the other hand, not all the traffic through this device needs double
> >>>>>>> encapsulation. But we have to turn off GRO completely for any ingress
> >>>>>>> device as a result.
> >>>>>>>
> >>>>>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> >>>>>>> this skb, rather than having to be 0-or-1 for all traffic.
> >>>>>>>
> >>>>>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> >>>>>>> ---
> >>>>>>> include/linux/netdevice.h | 9 +++++++--
> >>>>>>> include/linux/skbuff.h | 10 ++++++++++
> >>>>>>> net/Kconfig | 10 ++++++++++
> >>>>>>> net/core/gro.c | 2 +-
> >>>>>>> net/core/gro_cells.c | 2 +-
> >>>>>>> net/core/skbuff.c | 4 ++++
> >>>>>>> 6 files changed, 33 insertions(+), 4 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >>>>>>> index c83b390191d4..2ca0870b1221 100644
> >>>>>>> --- a/include/linux/netdevice.h
> >>>>>>> +++ b/include/linux/netdevice.h
> >>>>>>> @@ -2415,11 +2415,16 @@ struct net_device {
> >>>>>>> ((dev)->devlink_port = (port)); \
> >>>>>>> })
> >>>>>>>
> >>>>>>> -static inline bool netif_elide_gro(const struct net_device *dev)
> >>>>>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >>>>>>> {
> >>>>>>> - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> >>>>>>> + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >>>>>>> return true;
> >>>>>>> +
> >>>>>>> +#ifdef CONFIG_SKB_GRO_CONTROL
> >>>>>>> + return skb->gro_disabled;
> >>>>>>> +#else
> >>>>>>> return false;
> >>>>>>> +#endif
> >>>>>>
> >>>>>> Yet more branches in the hot path.
> >>>>>>
> >>>>>> Compile time configurability does not help, as that will be
> >>>>>> enabled by distros.
> >>>>>>
> >>>>>> For a fairly niche use case. Where functionality of GRO already
> >>>>>> works. So just a performance for a very rare case at the cost of a
> >>>>>> regression in the common case. A small regression perhaps, but death
> >>>>>> by a thousand cuts.
> >>>>>
> >>>>> Mentioning it here b/c it perhaps fits in this context, longer time ago
> >>>>> there was the idea mentioned to have BPF operating as GRO engine which
> >>>>> might also help to reduce attack surface by only having to handle packets
> >>>>> of interest for the concrete production use case. Perhaps here meta data
> >>>>> buffer could be used to pass a notification from XDP to exit early w/o
> >>>>> aggregation.
> >>>>
> >>>> Metadata is in fact one of our interests as well. We discussed using
> >>>> metadata instead of a skb bit to carry this information internally.
> >>>> Since metadata is opaque atm so it seems the only option is to have a
> >>>> GRO control hook before napi_gro_receive, and let BPF decide
> >>>> netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> >>>> it could indeed be more flexible, but the cons is that it could be
> >>>> even more slower than taking a bit on skb. I am actually open to
> >>>> either approach, as long as it gives us more control on when to enable
> >>>> GRO :)
> >>>
> >>> Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
> >>> counter map in XDP? For packets which should not be GRO-aggregated you
> >>> add count++ into the meta data area, and this forces GRO to not aggregate
> >>> since meta data that needs to be transported to tc BPF layer mismatches
> >>> (and therefore the contract/intent is that tc BPF needs to see the different
> >>> meta data passed to it).
> >>
> >> We did this before accidentally (we put a timestamp for debugging
> >> purposes in metadata) and this actually caused about 20% of OoO for
> >> TCP in production: all PSH packets are reordered. GRO does not fire
> >> the packet to the upper layer when a diff in metadata is found for a
> >> non-PSH packet, instead it is queued as a “new flow” on the GRO list
> >> and waits for flushing. When a PSH packet arrives, its semantic is to
> >> flush this packet immediately and thus precedes earlier packets of the
> >> same flow.
> >
> > Is that a bug in XDP metadata handling for GRO?
> >
> > Mismatching metadata should not be taken as separate flows, but as a
> > flush condition.
>
> Definitely a bug as it should flush. If noone is faster I can add it to my
> backlog todo to fix it, but might probably take a week before I get to it.
>
In theory we should flush if the same flow has different metadata.
However, "same flow" is not confirmed until GRO proceeds to the TCP
layer in this case. So to achieve this flush semantic, metadata would
have to be compared in the leaf protocol handlers instead of up front
inside dev_gro_receive. I am not quite sure whether that is worthwhile,
or whether we should just allow skipping GRO from XDP, since it is the
XDP program that sets this metadata in the first place.
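To make the current behavior concrete, the device-level comparison happens
roughly like this today (paraphrased from net/core/gro.c, not an exact
quote):

/* gro_list_prepare(): decide which held skbs might match @skb */
list_for_each_entry(p, head, list) {
	unsigned long diffs;

	diffs  = p->dev != skb->dev;
	/* ... vlan, mac header and other checks elided ... */
	diffs |= skb_metadata_differs(p, skb);

	/* a mismatch only means "not the same flow"; the held skb
	 * stays queued instead of being flushed
	 */
	NAPI_GRO_CB(p)->same_flow = !diffs;
}

A flush-on-mismatch semantic would instead have to be applied after the
TCP (or other leaf) receive handler has confirmed the 5-tuple really
matches, which is exactly the layering issue described above.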
Yan
> Thanks,
> Daniel
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 12:15 ` Willem de Bruijn
2024-06-21 12:47 ` Daniel Borkmann
@ 2024-06-21 15:34 ` Yan Zhai
2024-06-23 8:27 ` Willem de Bruijn
1 sibling, 1 reply; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 15:34 UTC (permalink / raw)
To: Willem de Bruijn
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, Alexander Lobakin, David Ahern, Richard Gobert,
Antoine Tenart, Felix Fietkau, Soheil Hassas Yeganeh,
Pavel Begunkov, Lorenzo Bianconi, Thomas Weißschuh,
linux-kernel, bpf
> > -static inline bool netif_elide_gro(const struct net_device *dev)
> > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > {
> > - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > return true;
> > +
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > + return skb->gro_disabled;
> > +#else
> > return false;
> > +#endif
>
> Yet more branches in the hot path.
>
> Compile time configurability does not help, as that will be
> enabled by distros.
>
> For a fairly niche use case. Where functionality of GRO already
> works. So just a performance for a very rare case at the cost of a
> regression in the common case. A small regression perhaps, but death
> by a thousand cuts.
>
I share your concern about adding to this hotpath. Would a
static_branch + sysctl make it less aggressive? Speaking of
performance, I'd hope this gives us more control so we can get the
best of both worlds: for TCP and some UDP traffic we keep GRO enabled,
while for other classes where we know GRO does no good or even does
harm, we disable GRO to save cycles. The key observation is that
developers may already know which traffic benefits from GRO, but they
lack a way to act on that knowledge.
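For illustration, a static_branch guard around the new check could look
roughly like this (names are made up; this is a sketch, not part of the
posted series):

/* DEFINE_STATIC_KEY_FALSE() in net/core, DECLARE_ in a header */
DEFINE_STATIC_KEY_FALSE(gro_skb_control_key);

static inline bool netif_elide_gro(const struct sk_buff *skb)
{
	if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
		return true;
#ifdef CONFIG_SKB_GRO_CONTROL
	/* patched out at runtime until the sysctl turns it on */
	if (static_branch_unlikely(&gro_skb_control_key))
		return skb->gro_disabled;
#endif
	return false;
}

The sysctl write handler would then just call static_branch_enable() or
static_branch_disable() on the key, so the common case pays for a nop
instead of a load and a conditional.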
best
Yan
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-21 15:34 ` Yan Zhai
@ 2024-06-23 8:27 ` Willem de Bruijn
2024-06-24 18:17 ` Yan Zhai
0 siblings, 1 reply; 33+ messages in thread
From: Willem de Bruijn @ 2024-06-23 8:27 UTC (permalink / raw)
To: Yan Zhai, Willem de Bruijn
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, Alexander Lobakin, David Ahern, Richard Gobert,
Antoine Tenart, Felix Fietkau, Soheil Hassas Yeganeh,
Pavel Begunkov, Lorenzo Bianconi, Thomas Weißschuh,
linux-kernel, bpf
Yan Zhai wrote:
> > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > > {
> > > - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > > return true;
> > > +
> > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > + return skb->gro_disabled;
> > > +#else
> > > return false;
> > > +#endif
> >
> > Yet more branches in the hot path.
> >
> > Compile time configurability does not help, as that will be
> > enabled by distros.
> >
> > For a fairly niche use case. Where functionality of GRO already
> > works. So just a performance for a very rare case at the cost of a
> > regression in the common case. A small regression perhaps, but death
> > by a thousand cuts.
> >
>
> I share your concern on operating on this hotpath. Will a
> static_branch + sysctl make it less aggressive?
That is always a possibility. But we have to use it judiciously; we
cannot add a sysctl for every branch.

I'm still of the opinion, which Paolo shared, that this seems like a
lot of complexity for a fairly minor performance optimization in a
rare case.
> Speaking of
> performance, I'd hope this can give us more control so we can achieve
> the best of two worlds: for TCP and some UDP traffic, we can enable
> GRO, while for some other classes that we know GRO does no good or
> even harm, let's disable GRO to save more cycles. The key observation
> is that developers may already know which traffic is blessed by GRO,
> but lack a way to realize it.
Following up also on Daniel's point about using BPF as a GRO engine:
even earlier I tried to add an option to selectively enable GRO
protocols without BPF. It is definitely worthwhile to be able to
disable GRO handlers to reduce the attack surface exposed to bad input.
>
> best
> Yan
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-23 8:27 ` Willem de Bruijn
@ 2024-06-24 18:17 ` Yan Zhai
2024-06-30 13:40 ` Willem de Bruijn
0 siblings, 1 reply; 33+ messages in thread
From: Yan Zhai @ 2024-06-24 18:17 UTC (permalink / raw)
To: Willem de Bruijn
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, Alexander Lobakin, David Ahern, Richard Gobert,
Antoine Tenart, Felix Fietkau, Soheil Hassas Yeganeh,
Pavel Begunkov, Lorenzo Bianconi, Thomas Weißschuh,
linux-kernel, bpf
On Sun, Jun 23, 2024 at 3:27 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yan Zhai wrote:
> > > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > > > {
> > > > - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > > + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > > > return true;
> > > > +
> > > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > > + return skb->gro_disabled;
> > > > +#else
> > > > return false;
> > > > +#endif
> > >
> > > Yet more branches in the hot path.
> > >
> > > Compile time configurability does not help, as that will be
> > > enabled by distros.
> > >
> > > For a fairly niche use case. Where functionality of GRO already
> > > works. So just a performance for a very rare case at the cost of a
> > > regression in the common case. A small regression perhaps, but death
> > > by a thousand cuts.
> > >
> >
> > I share your concern on operating on this hotpath. Will a
> > static_branch + sysctl make it less aggressive?
>
> That is always a possibility. But we have to use it judiciously,
> cannot add a sysctl for every branch.
>
> I'm still of the opinion that Paolo shared that this seems a lot of
> complexity for a fairly minor performance optimization for a rare
> case.
>
Actually, combining the discussion in this thread, I think this goes
beyond the corner case that we encountered. Let me elaborate below.
> > Speaking of
> > performance, I'd hope this can give us more control so we can achieve
> > the best of two worlds: for TCP and some UDP traffic, we can enable
> > GRO, while for some other classes that we know GRO does no good or
> > even harm, let's disable GRO to save more cycles. The key observation
> > is that developers may already know which traffic is blessed by GRO,
> > but lack a way to realize it.
>
> Following up also on Daniel's point on using BPF as GRO engine. Even
> earlier I tried to add an option to selectively enable GRO protocols
> without BPF. Definitely worthwhile to be able to disable GRO handlers
> to reduce attack surface to bad input.
>
I was probably staring too hard at my own things, which is indeed a
corner case. But reducing the attack surface is indeed a good
motivation for this patch. I checked briefly with our DoS team today:
the DoS scenario will definitely benefit from skipping GRO, for
example on SYN/RST floods. XDP is our main weapon to drop attack
traffic today, but it does not always drop 100% of a flood, and from
time to time it has to fall back to iptables due to the delay of
XDP program assembly or BPF limitations in analyzing the packet. I
did an ad hoc measurement just now on a mostly idle server, with a
~1.3Mpps SYN flood concentrated on one CPU and dropped early in
raw-PREROUTING. With GRO this consumed about 35-41% of the CPU
time, while without GRO the time dropped to 9-12%. That is pretty
significant breathing room under heavy attack.
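(For context, the kind of early drop described above can be expressed as a
plain raw-table rule, something along the lines of
"iptables -t raw -I PREROUTING -p tcp --syn -j DROP"; the exact rule is
illustrative. The point is that with GRO on, the flood still traverses the
GRO receive handlers before netfilter ever sees it.)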
But I am not sure I understand "BPF as GRO engine" here; it seems to
me that being able to disable GRO from XDP is already good enough. Is
there more motivation for doing more complex work here?
best
Yan
>
> >
> > best
> > Yan
>
>
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-24 18:17 ` Yan Zhai
@ 2024-06-30 13:40 ` Willem de Bruijn
2024-07-03 18:46 ` Yan Zhai
0 siblings, 1 reply; 33+ messages in thread
From: Willem de Bruijn @ 2024-06-30 13:40 UTC (permalink / raw)
To: Yan Zhai, Willem de Bruijn
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, Alexander Lobakin, David Ahern, Richard Gobert,
Antoine Tenart, Felix Fietkau, Soheil Hassas Yeganeh,
Pavel Begunkov, Lorenzo Bianconi, Thomas Weißschuh,
linux-kernel, bpf
Yan Zhai wrote:
> On Sun, Jun 23, 2024 at 3:27 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Yan Zhai wrote:
> > > > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > > > > {
> > > > > - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > > > + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > > > > return true;
> > > > > +
> > > > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > > > + return skb->gro_disabled;
> > > > > +#else
> > > > > return false;
> > > > > +#endif
> > > >
> > > > Yet more branches in the hot path.
> > > >
> > > > Compile time configurability does not help, as that will be
> > > > enabled by distros.
> > > >
> > > > For a fairly niche use case. Where functionality of GRO already
> > > > works. So just a performance for a very rare case at the cost of a
> > > > regression in the common case. A small regression perhaps, but death
> > > > by a thousand cuts.
> > > >
> > >
> > > I share your concern on operating on this hotpath. Will a
> > > static_branch + sysctl make it less aggressive?
> >
> > That is always a possibility. But we have to use it judiciously,
> > cannot add a sysctl for every branch.
> >
> > I'm still of the opinion that Paolo shared that this seems a lot of
> > complexity for a fairly minor performance optimization for a rare
> > case.
> >
> Actually combining the discussion in this thread, I think it would be
> more than the corner cases that we encounter. Let me elaborate below.
>
> > > Speaking of
> > > performance, I'd hope this can give us more control so we can achieve
> > > the best of two worlds: for TCP and some UDP traffic, we can enable
> > > GRO, while for some other classes that we know GRO does no good or
> > > even harm, let's disable GRO to save more cycles. The key observation
> > > is that developers may already know which traffic is blessed by GRO,
> > > but lack a way to realize it.
> >
> > Following up also on Daniel's point on using BPF as GRO engine. Even
> > earlier I tried to add an option to selectively enable GRO protocols
> > without BPF. Definitely worthwhile to be able to disable GRO handlers
> > to reduce attack surface to bad input.
> >
> I was probably staring too hard at my own things, which is indeed a
> corner case. But reducing the attack surface is indeed a good
> motivation for this patch. I checked briefly with our DoS team today,
> the DoS scenario will definitely benefit from skipping GRO, for
> example on SYN/RST floods. XDP is our main weapon to drop attack
> traffic today, but it does not always drop 100% of the floods, and
> time by time it does need to fall back to iptables due to the delay of
> XDP program assembly or the BPF limitation on analyzing the packet. I
> did an ad hoc measurement just now on a mostly idle server, with
> ~1.3Mpps SYN flood concentrated on one CPU and dropped them early in
> raw-PREROUTING. w/ GRO this would consume about 35-41% of the CPU
> time, while w/o GRO the time dropped to 9-12%. This seems a pretty
> significant breath room under heavy attacks.
A GRO opt-out might make sense.
A long time ago I sent a patch that configured GRO protocols using
syscalls, selectively (un)registering handlers. The interface was not
very nice, so I did not pursue it further. On the upside, the datapath
did not introduce any extra code. The intent was to reduce attack
surface of packet parsing code.
A few concerns with an XDP-based opt-out. It is more work to enable:
it requires compiling and loading an XDP program. It adds cycles in the
hot path. And I do not entirely understand when an XDP program would be
able to detect that a packet should not enter the GRO engine, yet
cannot drop the packet itself (your netfilter example above).
> But I am not sure I understand "BPF as GRO engine" here, it seems to
> me that being able to disable GRO by XDP is already good enough. Any
> more motivations to do more complex work here?
FWIW, we looked into this a few years ago. Analogous to the BPF flow
dissector: if the BPF program is loaded, use that instead of the C
code path. But we did not arrive at a practical implementation at the
time. Things may have changed, but one issue is how to store and
access the list (or table) of outstanding GRO skbs.
> best
> Yan
>
> >
> > >
> > > best
> > > Yan
> >
> >
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 1/9] skb: introduce gro_disabled bit
2024-06-30 13:40 ` Willem de Bruijn
@ 2024-07-03 18:46 ` Yan Zhai
0 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-07-03 18:46 UTC (permalink / raw)
To: Willem de Bruijn
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Willem de Bruijn,
Simon Horman, Florian Westphal, Mina Almasry, Abhishek Chauhan,
David Howells, Alexander Lobakin, David Ahern, Richard Gobert,
Antoine Tenart, Felix Fietkau, Soheil Hassas Yeganeh,
Pavel Begunkov, Lorenzo Bianconi, Thomas Weißschuh,
linux-kernel, bpf
On Sun, Jun 30, 2024 at 8:40 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yan Zhai wrote:
> > On Sun, Jun 23, 2024 at 3:27 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Yan Zhai wrote:
> > > > > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > > > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > > > > > {
> > > > > > - if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > > > > + if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > > > > > return true;
> > > > > > +
> > > > > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > > > > + return skb->gro_disabled;
> > > > > > +#else
> > > > > > return false;
> > > > > > +#endif
> > > > >
> > > > > Yet more branches in the hot path.
> > > > >
> > > > > Compile time configurability does not help, as that will be
> > > > > enabled by distros.
> > > > >
> > > > > For a fairly niche use case. Where functionality of GRO already
> > > > > works. So just a performance for a very rare case at the cost of a
> > > > > regression in the common case. A small regression perhaps, but death
> > > > > by a thousand cuts.
> > > > >
> > > >
> > > > I share your concern on operating on this hotpath. Will a
> > > > static_branch + sysctl make it less aggressive?
> > >
> > > That is always a possibility. But we have to use it judiciously,
> > > cannot add a sysctl for every branch.
> > >
> > > I'm still of the opinion that Paolo shared that this seems a lot of
> > > complexity for a fairly minor performance optimization for a rare
> > > case.
> > >
> > Actually combining the discussion in this thread, I think it would be
> > more than the corner cases that we encounter. Let me elaborate below.
> >
> > > > Speaking of
> > > > performance, I'd hope this can give us more control so we can achieve
> > > > the best of two worlds: for TCP and some UDP traffic, we can enable
> > > > GRO, while for some other classes that we know GRO does no good or
> > > > even harm, let's disable GRO to save more cycles. The key observation
> > > > is that developers may already know which traffic is blessed by GRO,
> > > > but lack a way to realize it.
> > >
> > > Following up also on Daniel's point on using BPF as GRO engine. Even
> > > earlier I tried to add an option to selectively enable GRO protocols
> > > without BPF. Definitely worthwhile to be able to disable GRO handlers
> > > to reduce attack surface to bad input.
> > >
> > I was probably staring too hard at my own things, which is indeed a
> > corner case. But reducing the attack surface is indeed a good
> > motivation for this patch. I checked briefly with our DoS team today,
> > the DoS scenario will definitely benefit from skipping GRO, for
> > example on SYN/RST floods. XDP is our main weapon to drop attack
> > traffic today, but it does not always drop 100% of the floods, and
> > time by time it does need to fall back to iptables due to the delay of
> > XDP program assembly or the BPF limitation on analyzing the packet. I
> > did an ad hoc measurement just now on a mostly idle server, with
> > ~1.3Mpps SYN flood concentrated on one CPU and dropped them early in
> > raw-PREROUTING. w/ GRO this would consume about 35-41% of the CPU
> > time, while w/o GRO the time dropped to 9-12%. This seems a pretty
> > significant breath room under heavy attacks.
>
> A GRO opt-out might make sense.
>
> A long time ago I sent a patch that configured GRO protocols using
> syscalls, selectively (un)registering handlers. The interface was not
> very nice, so I did not pursue it further. On the upside, the datapath
> did not introduce any extra code. The intent was to reduce attack
> surface of packet parsing code.
>
> A few concerns with an XDP based opt-out. It is more work to enable:
> requires compiling and load an XDP program. It adds cycles in the
> hot path. And I do not entirely understand when an XDP program will be
> able to detect that a packet should not enter the GRO engine, but
> cannot drop the packet (your netfilter example above).
>
Agreed that the XDP-based approach is only an offering for XDP users.
But given the way GRO works on flows today, it feels really hard to
provide an elegant and generic interface.

For DoS scenarios, let me expand a bit. Packets themselves can be a
good indicator that they should not go through GRO, like fragments, or
packets with special flags like SYN/RST/PSH. Under an attack we
sometimes also need conntrack or SYN cookies to help determine whether
packets are legit. We have a few kfuncs to look up conntrack entries
in XDP today, but I am not sure we can confidently drop packets there
without completely mirroring the full conntrack functionality. Rather,
using conntrack as an extra heuristic to mark suspicious packets in
XDP, like TCP packets out of window, etc., and still leaving the
verdict to iptables seems the safer thing to do. I have observed a few
occurrences in the past where a substantial amount of a SYN flood
passed through XDP thanks to some clever tricks in faking flow
headers. Those were eventually dealt with by SYN cookies, but all of
them went through GRO unnecessarily even though they all carried a SYN
flag. It would definitely be beneficial to save every cycle under
attack.
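As an illustration, an XDP program that lets SYN/RST packets bypass GRO
could look roughly like this, assuming the bpf_xdp_disable_gro() kfunc
proposed in patch 3/9 lands as-is (IP options are ignored and the parsing
is abbreviated; this is a sketch, not tested code):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

extern int bpf_xdp_disable_gro(struct xdp_md *ctx) __ksym;

SEC("xdp")
int xdp_skip_gro_for_syn(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph;
	struct tcphdr *tcp;

	if ((void *)(eth + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_PASS;

	iph = (void *)(eth + 1);
	if ((void *)(iph + 1) > data_end || iph->protocol != IPPROTO_TCP)
		return XDP_PASS;

	tcp = (void *)(iph + 1);	/* sketch: assumes no IP options */
	if ((void *)(tcp + 1) > data_end)
		return XDP_PASS;

	/* SYN and RST segments never benefit from aggregation, so ask the
	 * stack to skip GRO for them and keep the fast path for the rest.
	 */
	if (tcp->syn || tcp->rst)
		bpf_xdp_disable_gro(ctx);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";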
> > But I am not sure I understand "BPF as GRO engine" here, it seems to
> > me that being able to disable GRO by XDP is already good enough. Any
> > more motivations to do more complex work here?
>
> FWIW, we looked into this a few years ago. Analogous to the BPF flow
> dissector: if the BPF program is loaded, use that instead of the C
> code path. But we did not arrive at a practical implementation at the
> time. Things may have changed, but one issue is how to store and
> access the list (or table) of outstanding GRO skbs.
>
I see, thanks for the explanation.
Yan
> > best
> > Yan
> >
> > >
> > > >
> > > > best
> > > > Yan
> > >
> > >
>
>
^ permalink raw reply [flat|nested] 33+ messages in thread
* [RFC net-next 2/9] xdp: add XDP_FLAGS_GRO_DISABLED flag
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
2024-06-20 22:19 ` [RFC net-next 1/9] skb: introduce gro_disabled bit Yan Zhai
@ 2024-06-20 22:19 ` Yan Zhai
2024-06-21 9:15 ` Alexander Lobakin
2024-06-20 22:19 ` [RFC net-next 3/9] xdp: implement bpf_xdp_disable_gro kfunc Yan Zhai
` (6 subsequent siblings)
8 siblings, 1 reply; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: Alexei Starovoitov, Daniel Borkmann, David S. Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Eric Dumazet, Paolo Abeni, netdev, bpf, linux-kernel
Allow XDP program to set XDP_FLAGS_GRO_DISABLED flag in xdp_buff and
xdp_frame, and disable GRO when building an sk_buff if this flag is set.
Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
include/net/xdp.h | 38 +++++++++++++++++++++++++++++++++++++-
1 file changed, 37 insertions(+), 1 deletion(-)
diff --git a/include/net/xdp.h b/include/net/xdp.h
index e6770dd40c91..cc3bce8817b0 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -75,6 +75,7 @@ enum xdp_buff_flags {
XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under
* pressure
*/
+ XDP_FLAGS_GRO_DISABLED = BIT(2), /* GRO disabled */
};
struct xdp_buff {
@@ -113,12 +114,35 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
}
+static __always_inline void xdp_buff_disable_gro(struct xdp_buff *xdp)
+{
+ xdp->flags |= XDP_FLAGS_GRO_DISABLED;
+}
+
+static __always_inline bool xdp_buff_gro_disabled(struct xdp_buff *xdp)
+{
+ return !!(xdp->flags & XDP_FLAGS_GRO_DISABLED);
+}
+
+static __always_inline void
+xdp_buff_fixup_skb_offloading(struct xdp_buff *xdp, struct sk_buff *skb)
+{
+ if (xdp_buff_gro_disabled(xdp))
+ skb_disable_gro(skb);
+}
+
+static __always_inline void
+xdp_init_buff_minimal(struct xdp_buff *xdp)
+{
+ xdp->flags = 0;
+}
+
static __always_inline void
xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
{
xdp->frame_sz = frame_sz;
xdp->rxq = rxq;
- xdp->flags = 0;
+ xdp_init_buff_minimal(xdp);
}
static __always_inline void
@@ -187,6 +211,18 @@ static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame
return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
}
+static __always_inline bool xdp_frame_gro_disabled(struct xdp_frame *frame)
+{
+ return !!(frame->flags & XDP_FLAGS_GRO_DISABLED);
+}
+
+static __always_inline void
+xdp_frame_fixup_skb_offloading(struct xdp_frame *frame, struct sk_buff *skb)
+{
+ if (xdp_frame_gro_disabled(frame))
+ skb_disable_gro(skb);
+}
+
#define XDP_BULK_QUEUE_SIZE 16
struct xdp_frame_bulk {
int count;
--
2.30.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [RFC net-next 2/9] xdp: add XDP_FLAGS_GRO_DISABLED flag
2024-06-20 22:19 ` [RFC net-next 2/9] xdp: add XDP_FLAGS_GRO_DISABLED flag Yan Zhai
@ 2024-06-21 9:15 ` Alexander Lobakin
2024-06-21 16:12 ` Yan Zhai
0 siblings, 1 reply; 33+ messages in thread
From: Alexander Lobakin @ 2024-06-21 9:15 UTC (permalink / raw)
To: Yan Zhai
Cc: netdev, Alexei Starovoitov, Daniel Borkmann, David S. Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Eric Dumazet, Paolo Abeni, bpf, linux-kernel
From: Yan Zhai <yan@cloudflare.com>
Date: Thu, 20 Jun 2024 15:19:13 -0700
> Allow XDP program to set XDP_FLAGS_GRO_DISABLED flag in xdp_buff and
> xdp_frame, and disable GRO when building an sk_buff if this flag is set.
>
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
> include/net/xdp.h | 38 +++++++++++++++++++++++++++++++++++++-
> 1 file changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index e6770dd40c91..cc3bce8817b0 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -75,6 +75,7 @@ enum xdp_buff_flags {
> XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under
> * pressure
> */
> + XDP_FLAGS_GRO_DISABLED = BIT(2), /* GRO disabled */
There should be tabs, not spaces.
> };
>
> struct xdp_buff {
> @@ -113,12 +114,35 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
> xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
> }
>
> +static __always_inline void xdp_buff_disable_gro(struct xdp_buff *xdp)
> +{
> + xdp->flags |= XDP_FLAGS_GRO_DISABLED;
> +}
> +
> +static __always_inline bool xdp_buff_gro_disabled(struct xdp_buff *xdp)
> +{
> + return !!(xdp->flags & XDP_FLAGS_GRO_DISABLED);
> +}
> +
> +static __always_inline void
> +xdp_buff_fixup_skb_offloading(struct xdp_buff *xdp, struct sk_buff *skb)
> +{
> + if (xdp_buff_gro_disabled(xdp))
> + skb_disable_gro(skb);
> +}
I don't think this should be named "fixup". "propagate", "update",
"promote", ...?
Maybe `if` is not needed here?
skb->gro_disabled = xdp_buff_gro_disabled(xdp)
?
> +
> +static __always_inline void
> +xdp_init_buff_minimal(struct xdp_buff *xdp)
> +{
> + xdp->flags = 0;
> +}
"xdp_buff_clear_flags"?
> +
> static __always_inline void
> xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
> {
> xdp->frame_sz = frame_sz;
> xdp->rxq = rxq;
> - xdp->flags = 0;
> + xdp_init_buff_minimal(xdp);
> }
>
> static __always_inline void
> @@ -187,6 +211,18 @@ static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame
> return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
> }
>
> +static __always_inline bool xdp_frame_gro_disabled(struct xdp_frame *frame)
> +{
> + return !!(frame->flags & XDP_FLAGS_GRO_DISABLED);
> +}
> +
> +static __always_inline void
> +xdp_frame_fixup_skb_offloading(struct xdp_frame *frame, struct sk_buff *skb)
> +{
> + if (xdp_frame_gro_disabled(frame))
> + skb_disable_gro(skb);
> +}
(same)
> +
> #define XDP_BULK_QUEUE_SIZE 16
> struct xdp_frame_bulk {
> int count;
Thanks,
Olek
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC net-next 2/9] xdp: add XDP_FLAGS_GRO_DISABLED flag
2024-06-21 9:15 ` Alexander Lobakin
@ 2024-06-21 16:12 ` Yan Zhai
0 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 16:12 UTC (permalink / raw)
To: Alexander Lobakin
Cc: netdev, Alexei Starovoitov, Daniel Borkmann, David S. Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Eric Dumazet, Paolo Abeni, bpf, linux-kernel
On Fri, Jun 21, 2024 at 4:17 AM Alexander Lobakin
<aleksander.lobakin@intel.com> wrote:
>
> From: Yan Zhai <yan@cloudflare.com>
> Date: Thu, 20 Jun 2024 15:19:13 -0700
>
> > Allow XDP program to set XDP_FLAGS_GRO_DISABLED flag in xdp_buff and
> > xdp_frame, and disable GRO when building an sk_buff if this flag is set.
> >
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > ---
> > include/net/xdp.h | 38 +++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 37 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index e6770dd40c91..cc3bce8817b0 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -75,6 +75,7 @@ enum xdp_buff_flags {
> > XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under
> > * pressure
> > */
> > + XDP_FLAGS_GRO_DISABLED = BIT(2), /* GRO disabled */
>
> There should be tabs, not spaces.
>
> > };
> >
> > struct xdp_buff {
> > @@ -113,12 +114,35 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
> > xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
> > }
> >
> > +static __always_inline void xdp_buff_disable_gro(struct xdp_buff *xdp)
> > +{
> > + xdp->flags |= XDP_FLAGS_GRO_DISABLED;
> > +}
> > +
> > +static __always_inline bool xdp_buff_gro_disabled(struct xdp_buff *xdp)
> > +{
> > + return !!(xdp->flags & XDP_FLAGS_GRO_DISABLED);
> > +}
> > +
> > +static __always_inline void
> > +xdp_buff_fixup_skb_offloading(struct xdp_buff *xdp, struct sk_buff *skb)
> > +{
> > + if (xdp_buff_gro_disabled(xdp))
> > + skb_disable_gro(skb);
> > +}
>
> I don't think this should be named "fixup". "propagate", "update",
> "promote", ...?
>
> Maybe `if` is not needed here?
>
> skb->gro_disabled = xdp_buff_gro_disabled(xdp)
>
> ?
>
I called it "fixup" mainly because the current skb fields could be set
incorrectly (or maybe not?). For example, when XDP load-balances
traffic it may encapsulate and send from server A to server B, but
that means the HW on B may have no good way to calculate the actual
flow hash/csum. The original values could be read on A and carried in
the encapsulation header, so that B can use them to "fix up" those
fields.

But tbh, I am never good at naming, it's too hard for a non-native
speaker sometimes... So I am very open to whatever name the majority
of participants prefer :D

The `if` is still needed since I added a Kconfig option
(CONFIG_SKB_GRO_CONTROL) to wrap that bit.
best
Yan
> > +
> > +static __always_inline void
> > +xdp_init_buff_minimal(struct xdp_buff *xdp)
> > +{
> > + xdp->flags = 0;
> > +}
>
> "xdp_buff_clear_flags"?
>
> > +
> > static __always_inline void
> > xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
> > {
> > xdp->frame_sz = frame_sz;
> > xdp->rxq = rxq;
> > - xdp->flags = 0;
> > + xdp_init_buff_minimal(xdp);
> > }
> >
> > static __always_inline void
> > @@ -187,6 +211,18 @@ static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame
> > return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
> > }
> >
> > +static __always_inline bool xdp_frame_gro_disabled(struct xdp_frame *frame)
> > +{
> > + return !!(frame->flags & XDP_FLAGS_GRO_DISABLED);
> > +}
> > +
> > +static __always_inline void
> > +xdp_frame_fixup_skb_offloading(struct xdp_frame *frame, struct sk_buff *skb)
> > +{
> > + if (xdp_frame_gro_disabled(frame))
> > + skb_disable_gro(skb);
> > +}
>
> (same)
>
> > +
> > #define XDP_BULK_QUEUE_SIZE 16
> > struct xdp_frame_bulk {
> > int count;
>
> Thanks,
> Olek
^ permalink raw reply [flat|nested] 33+ messages in thread
* [RFC net-next 3/9] xdp: implement bpf_xdp_disable_gro kfunc
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
2024-06-20 22:19 ` [RFC net-next 1/9] skb: introduce gro_disabled bit Yan Zhai
2024-06-20 22:19 ` [RFC net-next 2/9] xdp: add XDP_FLAGS_GRO_DISABLED flag Yan Zhai
@ 2024-06-20 22:19 ` Yan Zhai
2024-06-20 22:19 ` [RFC net-next 4/9] bnxt: apply XDP offloading fixup when building skb Yan Zhai
` (5 subsequent siblings)
8 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: Alexei Starovoitov, Daniel Borkmann, David S. Miller,
Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
Eric Dumazet, Paolo Abeni, netdev, bpf, linux-kernel
Add a kfunc bpf_xdp_disable_gro to toggle XDP_FLAGS_GRO_DISABLED
flag.
Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
net/core/xdp.c | 27 ++++++++++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 41693154e426..d6e5f98a0081 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -770,6 +770,20 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
return -EOPNOTSUPP;
}
+/**
+ * bpf_xdp_disable_gro - Set a flag on underlying XDP buffer, indicating
+ * the stack to skip this packet for GRO processing. This flag will be
+ * passed on when the driver builds the skb.
+ *
+ * Return:
+ * * always returns 0
+ */
+__bpf_kfunc int bpf_xdp_disable_gro(struct xdp_md *ctx)
+{
+ xdp_buff_disable_gro((struct xdp_buff *)ctx);
+ return 0;
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
@@ -799,9 +813,20 @@ bool bpf_dev_bound_kfunc_id(u32 btf_id)
return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id);
}
+BTF_KFUNCS_START(xdp_common_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_xdp_disable_gro, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(xdp_common_kfunc_ids)
+
+static const struct btf_kfunc_id_set xdp_common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &xdp_common_kfunc_ids,
+};
+
static int __init xdp_metadata_init(void)
{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
+ int ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
+
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_common_kfunc_set);
}
late_initcall(xdp_metadata_init);
--
2.30.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC net-next 4/9] bnxt: apply XDP offloading fixup when building skb
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
` (2 preceding siblings ...)
2024-06-20 22:19 ` [RFC net-next 3/9] xdp: implement bpf_xdp_disable_gro kfunc Yan Zhai
@ 2024-06-20 22:19 ` Yan Zhai
2024-06-20 22:19 ` [RFC net-next 5/9] ice: " Yan Zhai
` (4 subsequent siblings)
8 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: Michael Chan, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, netdev, linux-kernel, bpf
Add a common point to transfer offloading info from XDP context to skb.
Signed-off-by: Yan Zhai <yan@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7dc00c0d8992..0036c4752f0d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2252,6 +2252,10 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
}
}
}
+
+ if (xdp_active)
+ xdp_buff_fixup_skb_offloading(&xdp, skb);
+
bnxt_deliver_skb(bp, bnapi, skb);
rc = 1;
--
2.30.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC net-next 5/9] ice: apply XDP offloading fixup when building skb
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
` (3 preceding siblings ...)
2024-06-20 22:19 ` [RFC net-next 4/9] bnxt: apply XDP offloading fixup when building skb Yan Zhai
@ 2024-06-20 22:19 ` Yan Zhai
2024-06-21 9:20 ` Alexander Lobakin
2024-06-20 22:19 ` [RFC net-next 6/9] veth: " Yan Zhai
` (3 subsequent siblings)
8 siblings, 1 reply; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: Jesse Brandeburg, Tony Nguyen, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Björn Töpel,
Magnus Karlsson, Maciej Fijalkowski, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, intel-wired-lan, netdev, linux-kernel, bpf
Add a common point to transfer offloading info from XDP context to skb.
Signed-off-by: Yan Zhai <yan@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
drivers/net/ethernet/intel/ice/ice_txrx.c | 2 ++
drivers/net/ethernet/intel/ice/ice_xsk.c | 6 +++++-
include/net/xdp_sock_drv.h | 2 +-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..a247306837ed 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -1222,6 +1222,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
offset;
+ xdp_init_buff_minimal(xdp);
xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
#if (PAGE_SIZE > 4096)
/* At larger PAGE_SIZE, frame_sz depend on len size */
@@ -1287,6 +1288,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
/* populate checksum, VLAN, and protocol */
ice_process_skb_fields(rx_ring, rx_desc, skb);
+ xdp_buff_fixup_skb_offloading(xdp, skb);
ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb);
/* send completed skb up the stack */
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index a65955eb23c0..367658acaab8 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -845,8 +845,10 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
xdp_ring = rx_ring->xdp_ring;
- if (ntc != rx_ring->first_desc)
+ if (ntc != rx_ring->first_desc) {
first = *ice_xdp_buf(rx_ring, rx_ring->first_desc);
+ xdp_init_buff_minimal(first);
+ }
while (likely(total_rx_packets < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
@@ -920,6 +922,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
break;
}
+ xdp = first;
first = NULL;
rx_ring->first_desc = ntc;
@@ -934,6 +937,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
vlan_tci = ice_get_vlan_tci(rx_desc);
ice_process_skb_fields(rx_ring, rx_desc, skb);
+ xdp_buff_fixup_skb_offloading(xdp, skb);
ice_receive_skb(rx_ring, skb, vlan_tci);
}
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 0a5dca2b2b3f..02243dc064c2 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -181,7 +181,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
xdp->data_meta = xdp->data;
xdp->data_end = xdp->data + size;
- xdp->flags = 0;
+ xdp_init_buff_minimal(xdp);
}
static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
--
2.30.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [RFC net-next 5/9] ice: apply XDP offloading fixup when building skb
2024-06-20 22:19 ` [RFC net-next 5/9] ice: " Yan Zhai
@ 2024-06-21 9:20 ` Alexander Lobakin
2024-06-21 16:05 ` Yan Zhai
0 siblings, 1 reply; 33+ messages in thread
From: Alexander Lobakin @ 2024-06-21 9:20 UTC (permalink / raw)
To: Yan Zhai
Cc: netdev, Jesse Brandeburg, Tony Nguyen, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Björn Töpel,
Magnus Karlsson, Maciej Fijalkowski, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, intel-wired-lan, linux-kernel, bpf
From: Yan Zhai <yan@cloudflare.com>
Date: Thu, 20 Jun 2024 15:19:22 -0700
> Add a common point to transfer offloading info from XDP context to skb.
>
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> ---
> drivers/net/ethernet/intel/ice/ice_txrx.c | 2 ++
> drivers/net/ethernet/intel/ice/ice_xsk.c | 6 +++++-
> include/net/xdp_sock_drv.h | 2 +-
> 3 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> index 8bb743f78fcb..a247306837ed 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> @@ -1222,6 +1222,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
>
> hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
> offset;
> + xdp_init_buff_minimal(xdp);
Two lines below, you have this:
xdp_buff_clear_frags_flag(xdp);
which clears the frags bit in xdp->flags. I.e., since you always clear
the flags here, that call becomes redundant.
But I'd say that `xdp->flags = 0` really wants to be moved from
xdp_init_buff() to xdp_prepare_buff().
> xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
> #if (PAGE_SIZE > 4096)
> /* At larger PAGE_SIZE, frame_sz depend on len size */
> @@ -1287,6 +1288,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
>
> /* populate checksum, VLAN, and protocol */
> ice_process_skb_fields(rx_ring, rx_desc, skb);
> + xdp_buff_fixup_skb_offloading(xdp, skb);
>
> ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb);
> /* send completed skb up the stack */
> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> index a65955eb23c0..367658acaab8 100644
> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> @@ -845,8 +845,10 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
> xdp_prog = READ_ONCE(rx_ring->xdp_prog);
> xdp_ring = rx_ring->xdp_ring;
>
> - if (ntc != rx_ring->first_desc)
> + if (ntc != rx_ring->first_desc) {
> first = *ice_xdp_buf(rx_ring, rx_ring->first_desc);
> + xdp_init_buff_minimal(first);
xdp_buff_set_size() always clears flags, this is redundant.
> + }
>
> while (likely(total_rx_packets < (unsigned int)budget)) {
> union ice_32b_rx_flex_desc *rx_desc;
> @@ -920,6 +922,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
> break;
> }
>
> + xdp = first;
> first = NULL;
> rx_ring->first_desc = ntc;
>
> @@ -934,6 +937,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
> vlan_tci = ice_get_vlan_tci(rx_desc);
>
> ice_process_skb_fields(rx_ring, rx_desc, skb);
> + xdp_buff_fixup_skb_offloading(xdp, skb);
> ice_receive_skb(rx_ring, skb, vlan_tci);
> }
>
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index 0a5dca2b2b3f..02243dc064c2 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -181,7 +181,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
> xdp->data_meta = xdp->data;
> xdp->data_end = xdp->data + size;
> - xdp->flags = 0;
> + xdp_init_buff_minimal(xdp);
Why is this done in the patch prefixed with "ice:"?
> }
>
> static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
Thanks,
Olek
* Re: [RFC net-next 5/9] ice: apply XDP offloading fixup when building skb
2024-06-21 9:20 ` Alexander Lobakin
@ 2024-06-21 16:05 ` Yan Zhai
0 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-21 16:05 UTC (permalink / raw)
To: Alexander Lobakin
Cc: netdev, Jesse Brandeburg, Tony Nguyen, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Björn Töpel,
Magnus Karlsson, Maciej Fijalkowski, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, intel-wired-lan, linux-kernel, bpf
On Fri, Jun 21, 2024 at 4:22 AM Alexander Lobakin
<aleksander.lobakin@intel.com> wrote:
>
> From: Yan Zhai <yan@cloudflare.com>
> Date: Thu, 20 Jun 2024 15:19:22 -0700
>
> > Add a common point to transfer offloading info from XDP context to skb.
> >
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> > ---
> > drivers/net/ethernet/intel/ice/ice_txrx.c | 2 ++
> > drivers/net/ethernet/intel/ice/ice_xsk.c | 6 +++++-
> > include/net/xdp_sock_drv.h | 2 +-
> > 3 files changed, 8 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > index 8bb743f78fcb..a247306837ed 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > @@ -1222,6 +1222,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
> >
> > hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
> > offset;
> > + xdp_init_buff_minimal(xdp);
>
> Two lines below, you have this:
>
> xdp_buff_clear_frags_flag(xdp);
>
> Which clears the frags bit in xdp->flags. I.e. since you always clear the
> flags here, this call becomes redundant.
> But I'd say that `xdp->flags = 0` really wants to be moved from
> xdp_init_buff() to xdp_prepare_buff().
>
You are right, there is some redundancy here. I will fix it if people
feel good about the use case in general :)
> > xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
> > #if (PAGE_SIZE > 4096)
> > /* At larger PAGE_SIZE, frame_sz depend on len size */
> > @@ -1287,6 +1288,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
> >
> > /* populate checksum, VLAN, and protocol */
> > ice_process_skb_fields(rx_ring, rx_desc, skb);
> > + xdp_buff_fixup_skb_offloading(xdp, skb);
> >
> > ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb);
> > /* send completed skb up the stack */
> > diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > index a65955eb23c0..367658acaab8 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > @@ -845,8 +845,10 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
> > xdp_prog = READ_ONCE(rx_ring->xdp_prog);
> > xdp_ring = rx_ring->xdp_ring;
> >
> > - if (ntc != rx_ring->first_desc)
> > + if (ntc != rx_ring->first_desc) {
> > first = *ice_xdp_buf(rx_ring, rx_ring->first_desc);
> > + xdp_init_buff_minimal(first);
>
> xdp_buff_set_size() always clears flags, this is redundant.
>
> > + }
> >
> > while (likely(total_rx_packets < (unsigned int)budget)) {
> > union ice_32b_rx_flex_desc *rx_desc;
> > @@ -920,6 +922,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
> > break;
> > }
> >
> > + xdp = first;
> > first = NULL;
> > rx_ring->first_desc = ntc;
> >
> > @@ -934,6 +937,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
> > vlan_tci = ice_get_vlan_tci(rx_desc);
> >
> > ice_process_skb_fields(rx_ring, rx_desc, skb);
> > + xdp_buff_fixup_skb_offloading(xdp, skb);
> > ice_receive_skb(rx_ring, skb, vlan_tci);
> > }
> >
> > diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> > index 0a5dca2b2b3f..02243dc064c2 100644
> > --- a/include/net/xdp_sock_drv.h
> > +++ b/include/net/xdp_sock_drv.h
> > @@ -181,7 +181,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> > xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
> > xdp->data_meta = xdp->data;
> > xdp->data_end = xdp->data + size;
> > - xdp->flags = 0;
> > + xdp_init_buff_minimal(xdp);
>
> Why is this done in the patch prefixed with "ice:"?
>
Good catch, this should be moved to the previous patch.
thanks
Yan
> > }
> >
> > static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
>
> Thanks,
> Olek
* [RFC net-next 6/9] veth: apply XDP offloading fixup when building skb
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
` (4 preceding siblings ...)
2024-06-20 22:19 ` [RFC net-next 5/9] ice: " Yan Zhai
@ 2024-06-20 22:19 ` Yan Zhai
2024-06-20 22:19 ` [RFC net-next 7/9] mlx5: move xdp_buff scope one level up Jesper Dangaard Brouer
` (2 subsequent siblings)
8 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, netdev, linux-kernel, bpf
Add a common point to transfer offloading info from XDP context to skb.
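
The helper itself is introduced earlier in the series and is not repeated
here; as a rough, hedged sketch, the fixup amounts to propagating the
per-packet GRO opt-out (XDP_FLAGS_GRO_DISABLED from patch 2) into
skb_disable_gro() from patch 1:

static inline void xdp_buff_fixup_skb_offloading(struct xdp_buff *xdp,
						 struct sk_buff *skb)
{
	/* copy the per-packet GRO opt-out from the XDP context to the skb */
	if (xdp->flags & XDP_FLAGS_GRO_DISABLED)
		skb_disable_gro(skb);
}

The xdp_frame variant used in veth_xdp_rcv_bulk_skb() would do the same
based on the frame's flags.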
Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
drivers/net/veth.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 426e68a95067..c799362a839c 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -703,6 +703,8 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
stats->rx_drops++;
continue;
}
+
+ xdp_frame_fixup_skb_offloading(frames[i], skb);
napi_gro_receive(&rq->xdp_napi, skb);
}
}
@@ -855,6 +857,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
metalen = xdp->data - xdp->data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
+
+ xdp_buff_fixup_skb_offloading(xdp, skb);
out:
return skb;
drop:
--
2.30.2
* [RFC net-next 7/9] mlx5: move xdp_buff scope one level up
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
` (5 preceding siblings ...)
2024-06-20 22:19 ` [RFC net-next 6/9] veth: " Yan Zhai
@ 2024-06-20 22:19 ` Jesper Dangaard Brouer
2024-06-20 22:19 ` [RFC net-next 8/9] mlx5: apply XDP offloading fixup when building skb Yan Zhai
2024-06-20 22:19 ` [RFC net-next 9/9] bpf: selftests: test disabling GRO by XDP Yan Zhai
8 siblings, 0 replies; 33+ messages in thread
From: Jesper Dangaard Brouer @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend, Yan Zhai,
Dragos Tatulea, Alexander Lobakin, netdev, linux-rdma,
linux-kernel, bpf
This is in preparation for the next patch: the xdp_buff (mxbuf) needs to be
visible in the CQE handlers so that XDP offloading info can be transferred
to the skb there.
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +-
.../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 6 +-
.../ethernet/mellanox/mlx5/core/en/xsk/rx.h | 6 +-
.../net/ethernet/mellanox/mlx5/core/en_rx.c | 103 +++++++++---------
4 files changed, 66 insertions(+), 55 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6a343a8f162f..3d26f976f692 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -580,14 +580,16 @@ struct mlx5e_mpw_info {
#define MLX5E_MAX_RX_FRAGS 4
struct mlx5e_rq;
+struct mlx5e_xdp_buff;
typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
typedef struct sk_buff *
(*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5_cqe64 *cqe, u16 cqe_bcnt,
- u32 head_offset, u32 page_idx);
+ u32 head_offset, u32 page_idx,
+ struct mlx5e_xdp_buff *mxbuf);
typedef struct sk_buff *
(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
- struct mlx5_cqe64 *cqe, u32 cqe_bcnt);
+ struct mlx5_cqe64 *cqe, u32 cqe_bcnt, struct mlx5e_xdp_buff *mxbuf);
typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
typedef void (*mlx5e_fp_shampo_dealloc_hd)(struct mlx5e_rq*, u16, u16, bool);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 1b7132fa70de..4dacaa61e106 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -249,7 +249,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5_cqe64 *cqe,
u16 cqe_bcnt,
u32 head_offset,
- u32 page_idx)
+ u32 page_idx,
+ struct mlx5e_xdp_buff *mxbuf_)
{
struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units.xsk_buffs[page_idx]);
struct bpf_prog *prog;
@@ -304,7 +305,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
struct mlx5_cqe64 *cqe,
- u32 cqe_bcnt)
+ u32 cqe_bcnt,
+ struct mlx5e_xdp_buff *mxbuf_)
{
struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(*wi->xskp);
struct bpf_prog *prog;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index cefc0ef6105d..0890c975042c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -16,10 +16,12 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5_cqe64 *cqe,
u16 cqe_bcnt,
u32 head_offset,
- u32 page_idx);
+ u32 page_idx,
+ struct mlx5e_xdp_buff *mxbuf_);
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
struct mlx5_cqe64 *cqe,
- u32 cqe_bcnt);
+ u32 cqe_bcnt,
+ struct mlx5e_xdp_buff *mxbuf_);
#endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 225da8d691fc..1a592a1ab988 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -63,11 +63,11 @@
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
- u32 page_idx);
+ u32 page_idx, struct mlx5e_xdp_buff *mxbuf);
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
- u32 page_idx);
+ u32 page_idx, struct mlx5e_xdp_buff *mxbuf);
static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
@@ -1658,7 +1658,8 @@ static void mlx5e_fill_mxbuf(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
static struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
- struct mlx5_cqe64 *cqe, u32 cqe_bcnt)
+ struct mlx5_cqe64 *cqe, u32 cqe_bcnt,
+ struct mlx5e_xdp_buff *mxbuf)
{
struct mlx5e_frag_page *frag_page = wi->frag_page;
u16 rx_headroom = rq->buff.headroom;
@@ -1680,17 +1681,15 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
prog = rcu_dereference(rq->xdp_prog);
if (prog) {
- struct mlx5e_xdp_buff mxbuf;
-
net_prefetchw(va); /* xdp_frame data area */
mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, rq->buff.frame0_sz,
- cqe_bcnt, &mxbuf);
- if (mlx5e_xdp_handle(rq, prog, &mxbuf))
+ cqe_bcnt, mxbuf);
+ if (mlx5e_xdp_handle(rq, prog, mxbuf))
return NULL; /* page/packet was consumed by XDP */
- rx_headroom = mxbuf.xdp.data - mxbuf.xdp.data_hard_start;
- metasize = mxbuf.xdp.data - mxbuf.xdp.data_meta;
- cqe_bcnt = mxbuf.xdp.data_end - mxbuf.xdp.data;
+ rx_headroom = mxbuf->xdp.data - mxbuf->xdp.data_hard_start;
+ metasize = mxbuf->xdp.data - mxbuf->xdp.data_meta;
+ cqe_bcnt = mxbuf->xdp.data_end - mxbuf->xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
@@ -1706,14 +1705,14 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
static struct sk_buff *
mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
- struct mlx5_cqe64 *cqe, u32 cqe_bcnt)
+ struct mlx5_cqe64 *cqe, u32 cqe_bcnt,
+ struct mlx5e_xdp_buff *mxbuf)
{
struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
struct mlx5e_wqe_frag_info *head_wi = wi;
u16 rx_headroom = rq->buff.headroom;
struct mlx5e_frag_page *frag_page;
struct skb_shared_info *sinfo;
- struct mlx5e_xdp_buff mxbuf;
u32 frag_consumed_bytes;
struct bpf_prog *prog;
struct sk_buff *skb;
@@ -1733,8 +1732,8 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
net_prefetch(va + rx_headroom);
mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, rq->buff.frame0_sz,
- frag_consumed_bytes, &mxbuf);
- sinfo = xdp_get_shared_info_from_buff(&mxbuf.xdp);
+ frag_consumed_bytes, mxbuf);
+ sinfo = xdp_get_shared_info_from_buff(&mxbuf->xdp);
truesize = 0;
cqe_bcnt -= frag_consumed_bytes;
@@ -1746,7 +1745,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
frag_consumed_bytes = min_t(u32, frag_info->frag_size, cqe_bcnt);
- mlx5e_add_skb_shared_info_frag(rq, sinfo, &mxbuf.xdp, frag_page,
+ mlx5e_add_skb_shared_info_frag(rq, sinfo, &mxbuf->xdp, frag_page,
wi->offset, frag_consumed_bytes);
truesize += frag_info->frag_stride;
@@ -1756,7 +1755,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
}
prog = rcu_dereference(rq->xdp_prog);
- if (prog && mlx5e_xdp_handle(rq, prog, &mxbuf)) {
+ if (prog && mlx5e_xdp_handle(rq, prog, mxbuf)) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
struct mlx5e_wqe_frag_info *pwi;
@@ -1766,21 +1765,21 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
return NULL; /* page/packet was consumed by XDP */
}
- skb = mlx5e_build_linear_skb(rq, mxbuf.xdp.data_hard_start, rq->buff.frame0_sz,
- mxbuf.xdp.data - mxbuf.xdp.data_hard_start,
- mxbuf.xdp.data_end - mxbuf.xdp.data,
- mxbuf.xdp.data - mxbuf.xdp.data_meta);
+ skb = mlx5e_build_linear_skb(rq, mxbuf->xdp.data_hard_start, rq->buff.frame0_sz,
+ mxbuf->xdp.data - mxbuf->xdp.data_hard_start,
+ mxbuf->xdp.data_end - mxbuf->xdp.data,
+ mxbuf->xdp.data - mxbuf->xdp.data_meta);
if (unlikely(!skb))
return NULL;
skb_mark_for_recycle(skb);
head_wi->frag_page->frags++;
- if (xdp_buff_has_frags(&mxbuf.xdp)) {
+ if (xdp_buff_has_frags(&mxbuf->xdp)) {
/* sinfo->nr_frags is reset by build_skb, calculate again. */
xdp_update_skb_shared_info(skb, wi - head_wi - 1,
sinfo->xdp_frags_size, truesize,
- xdp_buff_is_frag_pfmemalloc(&mxbuf.xdp));
+ xdp_buff_is_frag_pfmemalloc(&mxbuf->xdp));
for (struct mlx5e_wqe_frag_info *pwi = head_wi + 1; pwi < wi; pwi++)
pwi->frag_page->frags++;
@@ -1811,6 +1810,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
{
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
struct mlx5e_wqe_frag_info *wi;
+ struct mlx5e_xdp_buff mxbuf;
struct sk_buff *skb;
u32 cqe_bcnt;
u16 ci;
@@ -1828,7 +1828,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
mlx5e_skb_from_cqe_linear,
mlx5e_skb_from_cqe_nonlinear,
mlx5e_xsk_skb_from_cqe_linear,
- rq, wi, cqe, cqe_bcnt);
+ rq, wi, cqe, cqe_bcnt, &mxbuf);
if (!skb) {
/* probably for XDP */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
@@ -1859,6 +1859,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
struct mlx5_eswitch_rep *rep = rpriv->rep;
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
struct mlx5e_wqe_frag_info *wi;
+ struct mlx5e_xdp_buff mxbuf;
struct sk_buff *skb;
u32 cqe_bcnt;
u16 ci;
@@ -1875,7 +1876,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe,
mlx5e_skb_from_cqe_linear,
mlx5e_skb_from_cqe_nonlinear,
- rq, wi, cqe, cqe_bcnt);
+ rq, wi, cqe, cqe_bcnt, &mxbuf);
if (!skb) {
/* probably for XDP */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
@@ -1903,6 +1904,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64
u32 wqe_offset = stride_ix << rq->mpwqe.log_stride_sz;
u32 head_offset = wqe_offset & ((1 << rq->mpwqe.page_shift) - 1);
u32 page_idx = wqe_offset >> rq->mpwqe.page_shift;
+ struct mlx5e_xdp_buff mxbuf;
struct mlx5e_rx_wqe_ll *wqe;
struct mlx5_wq_ll *wq;
struct sk_buff *skb;
@@ -1928,7 +1930,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64
skb = INDIRECT_CALL_2(rq->mpwqe.skb_from_cqe_mpwrq,
mlx5e_skb_from_cqe_mpwrq_linear,
mlx5e_skb_from_cqe_mpwrq_nonlinear,
- rq, wi, cqe, cqe_bcnt, head_offset, page_idx);
+ rq, wi, cqe, cqe_bcnt, head_offset, page_idx, &mxbuf);
if (!skb)
goto mpwrq_cqe_out;
@@ -1975,7 +1977,7 @@ mlx5e_shampo_fill_skb_data(struct sk_buff *skb, struct mlx5e_rq *rq,
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
- u32 page_idx)
+ u32 page_idx, struct mlx5e_xdp_buff *mxbuf)
{
struct mlx5e_frag_page *frag_page = &wi->alloc_units.frag_pages[page_idx];
u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt);
@@ -1983,7 +1985,6 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
u32 frag_offset = head_offset;
u32 byte_cnt = cqe_bcnt;
struct skb_shared_info *sinfo;
- struct mlx5e_xdp_buff mxbuf;
unsigned int truesize = 0;
struct bpf_prog *prog;
struct sk_buff *skb;
@@ -2029,9 +2030,9 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
}
}
- mlx5e_fill_mxbuf(rq, cqe, va, linear_hr, linear_frame_sz, linear_data_len, &mxbuf);
+ mlx5e_fill_mxbuf(rq, cqe, va, linear_hr, linear_frame_sz, linear_data_len, mxbuf);
- sinfo = xdp_get_shared_info_from_buff(&mxbuf.xdp);
+ sinfo = xdp_get_shared_info_from_buff(&mxbuf->xdp);
while (byte_cnt) {
/* Non-linear mode, hence non-XSK, which always uses PAGE_SIZE. */
@@ -2042,7 +2043,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
else
truesize += ALIGN(pg_consumed_bytes, BIT(rq->mpwqe.log_stride_sz));
- mlx5e_add_skb_shared_info_frag(rq, sinfo, &mxbuf.xdp, frag_page, frag_offset,
+ mlx5e_add_skb_shared_info_frag(rq, sinfo, &mxbuf->xdp, frag_page, frag_offset,
pg_consumed_bytes);
byte_cnt -= pg_consumed_bytes;
frag_offset = 0;
@@ -2050,7 +2051,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
}
if (prog) {
- if (mlx5e_xdp_handle(rq, prog, &mxbuf)) {
+ if (mlx5e_xdp_handle(rq, prog, mxbuf)) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
struct mlx5e_frag_page *pfp;
@@ -2063,10 +2064,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
return NULL; /* page/packet was consumed by XDP */
}
- skb = mlx5e_build_linear_skb(rq, mxbuf.xdp.data_hard_start,
+ skb = mlx5e_build_linear_skb(rq, mxbuf->xdp.data_hard_start,
linear_frame_sz,
- mxbuf.xdp.data - mxbuf.xdp.data_hard_start, 0,
- mxbuf.xdp.data - mxbuf.xdp.data_meta);
+ mxbuf->xdp.data - mxbuf->xdp.data_hard_start, 0,
+ mxbuf->xdp.data - mxbuf->xdp.data_meta);
if (unlikely(!skb)) {
mlx5e_page_release_fragmented(rq, &wi->linear_page);
return NULL;
@@ -2076,13 +2077,13 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
wi->linear_page.frags++;
mlx5e_page_release_fragmented(rq, &wi->linear_page);
- if (xdp_buff_has_frags(&mxbuf.xdp)) {
+ if (xdp_buff_has_frags(&mxbuf->xdp)) {
struct mlx5e_frag_page *pagep;
/* sinfo->nr_frags is reset by build_skb, calculate again. */
xdp_update_skb_shared_info(skb, frag_page - head_page,
sinfo->xdp_frags_size, truesize,
- xdp_buff_is_frag_pfmemalloc(&mxbuf.xdp));
+ xdp_buff_is_frag_pfmemalloc(&mxbuf->xdp));
pagep = head_page;
do
@@ -2093,12 +2094,12 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
} else {
dma_addr_t addr;
- if (xdp_buff_has_frags(&mxbuf.xdp)) {
+ if (xdp_buff_has_frags(&mxbuf->xdp)) {
struct mlx5e_frag_page *pagep;
xdp_update_skb_shared_info(skb, sinfo->nr_frags,
sinfo->xdp_frags_size, truesize,
- xdp_buff_is_frag_pfmemalloc(&mxbuf.xdp));
+ xdp_buff_is_frag_pfmemalloc(&mxbuf->xdp));
pagep = frag_page - sinfo->nr_frags;
do
@@ -2120,7 +2121,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
- u32 page_idx)
+ u32 page_idx, struct mlx5e_xdp_buff *mxbuf)
{
struct mlx5e_frag_page *frag_page = &wi->alloc_units.frag_pages[page_idx];
u16 rx_headroom = rq->buff.headroom;
@@ -2148,20 +2149,19 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
prog = rcu_dereference(rq->xdp_prog);
if (prog) {
- struct mlx5e_xdp_buff mxbuf;
net_prefetchw(va); /* xdp_frame data area */
mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, rq->buff.frame0_sz,
- cqe_bcnt, &mxbuf);
- if (mlx5e_xdp_handle(rq, prog, &mxbuf)) {
+ cqe_bcnt, mxbuf);
+ if (mlx5e_xdp_handle(rq, prog, mxbuf)) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
frag_page->frags++;
return NULL; /* page/packet was consumed by XDP */
}
- rx_headroom = mxbuf.xdp.data - mxbuf.xdp.data_hard_start;
- metasize = mxbuf.xdp.data - mxbuf.xdp.data_meta;
- cqe_bcnt = mxbuf.xdp.data_end - mxbuf.xdp.data;
+ rx_headroom = mxbuf->xdp.data - mxbuf->xdp.data_hard_start;
+ metasize = mxbuf->xdp.data - mxbuf->xdp.data_meta;
+ cqe_bcnt = mxbuf->xdp.data_end - mxbuf->xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
@@ -2283,12 +2283,14 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
bool flush = cqe->shampo.flush;
bool match = cqe->shampo.match;
struct mlx5e_rq_stats *stats = rq->stats;
+ struct mlx5e_xdp_buff mxbuf;
struct mlx5e_rx_wqe_ll *wqe;
struct mlx5e_mpw_info *wi;
struct mlx5_wq_ll *wq;
wi = mlx5e_get_mpw_info(rq, wqe_id);
wi->consumed_strides += cstrides;
+ mxbuf.xdp.flags = 0;
if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
mlx5e_handle_rx_err_cqe(rq, cqe);
@@ -2311,7 +2313,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
*skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index);
else
*skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe, cqe_bcnt,
- data_offset, page_idx);
+ data_offset, page_idx, &mxbuf);
if (unlikely(!*skb))
goto free_hd_entry;
@@ -2369,6 +2371,7 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq
u32 wqe_offset = stride_ix << rq->mpwqe.log_stride_sz;
u32 head_offset = wqe_offset & ((1 << rq->mpwqe.page_shift) - 1);
u32 page_idx = wqe_offset >> rq->mpwqe.page_shift;
+ struct mlx5e_xdp_buff mxbuf;
struct mlx5e_rx_wqe_ll *wqe;
struct mlx5_wq_ll *wq;
struct sk_buff *skb;
@@ -2396,7 +2399,7 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq
mlx5e_skb_from_cqe_mpwrq_nonlinear,
mlx5e_xsk_skb_from_cqe_mpwrq_linear,
rq, wi, cqe, cqe_bcnt, head_offset,
- page_idx);
+ page_idx, &mxbuf);
if (!skb)
goto mpwrq_cqe_out;
@@ -2624,6 +2627,7 @@ static void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
{
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
struct mlx5e_wqe_frag_info *wi;
+ struct mlx5e_xdp_buff mxbuf;
struct sk_buff *skb;
u32 cqe_bcnt;
u16 ci;
@@ -2640,7 +2644,7 @@ static void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe,
mlx5e_skb_from_cqe_linear,
mlx5e_skb_from_cqe_nonlinear,
- rq, wi, cqe, cqe_bcnt);
+ rq, wi, cqe, cqe_bcnt, &mxbuf);
if (!skb)
goto wq_cyc_pop;
@@ -2714,6 +2718,7 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe
{
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
struct mlx5e_wqe_frag_info *wi;
+ struct mlx5e_xdp_buff mxbuf;
struct sk_buff *skb;
u32 cqe_bcnt;
u16 trap_id;
@@ -2729,7 +2734,7 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe
goto wq_cyc_pop;
}
- skb = mlx5e_skb_from_cqe_nonlinear(rq, wi, cqe, cqe_bcnt);
+ skb = mlx5e_skb_from_cqe_nonlinear(rq, wi, cqe, cqe_bcnt, &mxbuf);
if (!skb)
goto wq_cyc_pop;
--
2.30.2
* [RFC net-next 8/9] mlx5: apply XDP offloading fixup when building skb
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
` (6 preceding siblings ...)
2024-06-20 22:19 ` [RFC net-next 7/9] mlx5: move xdp_buff scope one level up Jesper Dangaard Brouer
@ 2024-06-20 22:19 ` Yan Zhai
2024-06-20 22:19 ` [RFC net-next 9/9] bpf: selftests: test disabling GRO by XDP Yan Zhai
8 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Alexander Lobakin, Yan Zhai, Dragos Tatulea, netdev, linux-rdma,
linux-kernel, bpf
Add a common point to transfer offloading info from the XDP context to the
skb. For the XSK paths the xdp_buff lives in the UMEM-backed mxbuf, so only
the resulting flags are copied into the caller-provided mxbuf before the
fixup runs in the CQE handlers.
Signed-off-by: Yan Zhai <yan@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
.../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c | 8 ++++++--
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 14 ++++++++++++++
2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 4dacaa61e106..9bf49ff2e0dd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -250,7 +250,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
u16 cqe_bcnt,
u32 head_offset,
u32 page_idx,
- struct mlx5e_xdp_buff *mxbuf_)
+ struct mlx5e_xdp_buff *mxbuf_caller)
{
struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units.xsk_buffs[page_idx]);
struct bpf_prog *prog;
@@ -270,6 +270,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
/* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
mxbuf->cqe = cqe;
+ xdp_init_buff_minimal(&mxbuf->xdp);
xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
xsk_buff_dma_sync_for_cpu(&mxbuf->xdp);
net_prefetch(mxbuf->xdp.data);
@@ -295,6 +296,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
__set_bit(page_idx, wi->skip_release_bitmap); /* non-atomic */
return NULL; /* page/packet was consumed by XDP */
}
+ mxbuf_caller->xdp.flags = mxbuf->xdp.flags;
/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
* frame. On SKB allocation failure, NULL is returned.
@@ -306,7 +308,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
struct mlx5_cqe64 *cqe,
u32 cqe_bcnt,
- struct mlx5e_xdp_buff *mxbuf_)
+ struct mlx5e_xdp_buff *mxbuf_caller)
{
struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(*wi->xskp);
struct bpf_prog *prog;
@@ -320,6 +322,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
/* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
mxbuf->cqe = cqe;
+ xdp_init_buff_minimal(&mxbuf->xdp);
xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
xsk_buff_dma_sync_for_cpu(&mxbuf->xdp);
net_prefetch(mxbuf->xdp.data);
@@ -330,6 +333,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
wi->flags |= BIT(MLX5E_WQE_FRAG_SKIP_RELEASE);
return NULL; /* page/packet was consumed by XDP */
}
+ mxbuf_caller->xdp.flags = mxbuf->xdp.flags;
/* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse
* will be handled by mlx5e_free_rx_wqe.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 1a592a1ab988..0a47889e281e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1670,6 +1670,8 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
dma_addr_t addr;
u32 frag_size;
+ xdp_init_buff_minimal(&mxbuf->xdp);
+
va = page_address(frag_page->page) + wi->offset;
data = va + rx_headroom;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
@@ -1721,6 +1723,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
void *va;
frag_page = wi->frag_page;
+ xdp_init_buff_minimal(&mxbuf->xdp);
va = page_address(frag_page->page) + wi->offset;
frag_consumed_bytes = min_t(u32, frag_info->frag_size, cqe_bcnt);
@@ -1837,6 +1840,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
}
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+ xdp_buff_fixup_skb_offloading(&mxbuf.xdp, skb);
if (mlx5e_cqe_regb_chain(cqe))
if (!mlx5e_tc_update_skb_nic(cqe, skb)) {
@@ -1885,6 +1889,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
}
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+ xdp_buff_fixup_skb_offloading(&mxbuf.xdp, skb);
if (rep->vlan && skb_vlan_tag_present(skb))
skb_vlan_pop(skb);
@@ -1935,6 +1940,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64
goto mpwrq_cqe_out;
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+ xdp_buff_fixup_skb_offloading(&mxbuf.xdp, skb);
mlx5e_rep_tc_receive(cqe, rq, skb);
@@ -2138,6 +2144,8 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
return NULL;
}
+ xdp_init_buff_minimal(&mxbuf->xdp);
+
va = page_address(frag_page->page) + head_offset;
data = va + rx_headroom;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
@@ -2345,6 +2353,8 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
}
mlx5e_shampo_complete_rx_cqe(rq, cqe, cqe_bcnt, *skb);
+ xdp_buff_fixup_skb_offloading(&mxbuf.xdp, *skb);
+
if (flush && rq->hw_gro_data->skb)
mlx5e_shampo_flush_skb(rq, cqe, match);
free_hd_entry:
@@ -2404,6 +2414,7 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq
goto mpwrq_cqe_out;
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+ xdp_buff_fixup_skb_offloading(&mxbuf.xdp, skb);
if (mlx5e_cqe_regb_chain(cqe))
if (!mlx5e_tc_update_skb_nic(cqe, skb)) {
@@ -2649,6 +2660,8 @@ static void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
goto wq_cyc_pop;
mlx5i_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+ xdp_buff_fixup_skb_offloading(&mxbuf.xdp, skb);
+
if (unlikely(!skb->dev)) {
dev_kfree_skb_any(skb);
goto wq_cyc_pop;
@@ -2740,6 +2753,7 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
skb_push(skb, ETH_HLEN);
+ xdp_buff_fixup_skb_offloading(&mxbuf.xdp, skb);
mlx5_devlink_trap_report(rq->mdev, trap_id, skb,
rq->netdev->devlink_port);
--
2.30.2
* [RFC net-next 9/9] bpf: selftests: test disabling GRO by XDP
2024-06-20 22:19 [RFC net-next 0/9] xdp: allow disable GRO per packet by XDP Yan Zhai
` (7 preceding siblings ...)
2024-06-20 22:19 ` [RFC net-next 8/9] mlx5: apply XDP offloading fixup when building skb Yan Zhai
@ 2024-06-20 22:19 ` Yan Zhai
8 siblings, 0 replies; 33+ messages in thread
From: Yan Zhai @ 2024-06-20 22:19 UTC (permalink / raw)
To: netdev
Cc: Andrii Nakryiko, Eduard Zingerman, Mykola Lysenko,
Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, Jiri Olsa, Shuah Khan, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, linux-kernel, bpf, linux-kselftest,
netdev
Test that when XDP disables GRO for a packet, the effect is actually
reflected on the receiving side.
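
With the usual selftest harness this should be runnable as
`cd tools/testing/selftests/bpf && ./test_progs -t xdp_offloading`
(the test name follows from the test_xdp_offloading() entry point).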
Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
tools/testing/selftests/bpf/config | 1 +
.../selftests/bpf/prog_tests/xdp_offloading.c | 122 ++++++++++++++++++
.../selftests/bpf/progs/xdp_offloading.c | 50 +++++++
3 files changed, 173 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_offloading.c
create mode 100644 tools/testing/selftests/bpf/progs/xdp_offloading.c
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 2fb16da78dce..e789392f44bd 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -96,3 +96,4 @@ CONFIG_XDP_SOCKETS=y
CONFIG_XFRM_INTERFACE=y
CONFIG_TCP_CONG_DCTCP=y
CONFIG_TCP_CONG_BBR=y
+CONFIG_SKB_GRO_CONTROL=y
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_offloading.c b/tools/testing/selftests/bpf/prog_tests/xdp_offloading.c
new file mode 100644
index 000000000000..462296d9689a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_offloading.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "xdp_offloading.skel.h"
+
+/* run tcp server in ns1, client in ns2, and transmit 10MB data */
+static void run_tcp_test(const char *server_ip)
+{
+ struct nstoken *ns1 = NULL, *ns2 = NULL;
+ struct sockaddr_storage server_addr;
+ int total_bytes = 10 * 1024 * 1024;
+ int server_fd = -1, client_fd = -1;
+ int server_port = 5555;
+
+ socklen_t addrlen = sizeof(server_addr);
+
+ if (!ASSERT_OK(make_sockaddr(AF_INET, server_ip, server_port,
+ &server_addr, &addrlen), "make_addr"))
+ goto err;
+
+ ns1 = open_netns("ns1");
+ if (!ASSERT_OK_PTR(ns1, "setns ns1"))
+ goto err;
+
+ server_fd = start_server_str(AF_INET, SOCK_STREAM, "0.0.0.0",
+ server_port, NULL);
+ if (!ASSERT_NEQ(server_fd, -1, "start_server_str"))
+ goto err;
+
+ ns2 = open_netns("ns2");
+ if (!ASSERT_OK_PTR(ns2, "setns ns2"))
+ goto err;
+
+ client_fd = connect_to_addr(SOCK_STREAM, &server_addr, addrlen, NULL);
+ if (!ASSERT_NEQ(client_fd, -1, "connect_to_addr"))
+ goto err;
+
+ /* send 10MB data */
+ if (!ASSERT_OK(send_recv_data(server_fd, client_fd, total_bytes),
+ "send_recv_data"))
+ goto err;
+
+err:
+ if (server_fd != -1)
+ close(server_fd);
+ if (client_fd != -1)
+ close(client_fd);
+ if (ns1)
+ close_netns(ns1);
+ if (ns2)
+ close_netns(ns2);
+}
+
+/* This test involves two netns:
+ * NS1 | NS2
+ * |
+ * ----> veth1 --> veth_offloading(xdp)-->(tp:netif_receive_skb)
+ * | | |
+ * | | v
+ * tcp-server | tcp-client
+ *
+ * a TCP server in NS1 sends data through veth1, and the XDP program on
+ * "xdp_offloading" is what we test against. This XDP program will apply
+ * offloadings and we examine these at netif_receive_skb tracepoint if the
+ * offloadings are propagated to skbs.
+ */
+void test_xdp_offloading(void)
+{
+ const char *xdp_ifname = "veth_offloading";
+ struct nstoken *nstoken = NULL;
+ struct xdp_offloading *skel = NULL;
+ struct bpf_link *link_xdp, *link_tp;
+ const char *server_ip = "192.168.0.2";
+ const char *client_ip = "192.168.0.3";
+ int ifindex;
+
+ SYS(out, "ip netns add ns1");
+ SYS(out, "ip netns add ns2");
+ SYS(out, "ip -n ns1 link add veth1 type veth peer name %s netns ns2",
+ xdp_ifname);
+ SYS(out, "ip -n ns1 link set veth1 up");
+ SYS(out, "ip -n ns2 link set veth_offloading up");
+ SYS(out, "ip -n ns1 addr add dev veth1 %s/31", server_ip);
+ SYS(out, "ip -n ns2 addr add dev %s %s/31", xdp_ifname, client_ip);
+
+ SYS(out, "ip netns exec ns2 ethtool -K %s gro on", xdp_ifname);
+
+ nstoken = open_netns("ns2");
+ if (!ASSERT_OK_PTR(nstoken, "setns"))
+ goto out;
+
+ skel = xdp_offloading__open();
+ if (!ASSERT_OK_PTR(skel, "skel"))
+ goto out;
+
+ ifindex = if_nametoindex(xdp_ifname);
+ if (!ASSERT_NEQ(ifindex, 0, "ifindex"))
+ goto out;
+
+ memcpy(skel->rodata->target_ifname, xdp_ifname, IFNAMSIZ);
+
+ if (!ASSERT_OK(xdp_offloading__load(skel), "load"))
+ goto out;
+
+ link_xdp = bpf_program__attach_xdp(skel->progs.xdp_disable_gro, ifindex);
+ if (!ASSERT_OK_PTR(link_xdp, "xdp_attach"))
+ goto out;
+
+ link_tp = bpf_program__attach(skel->progs.observe_skb_gro_disabled);
+ if (!ASSERT_OK_PTR(link_tp, "tp_attach"))
+ goto out;
+
+ run_tcp_test(server_ip);
+
+ ASSERT_EQ(__sync_fetch_and_add(&skel->bss->invalid_skb, 0), 0,
+ "check invalid skbs");
+out:
+ if (nstoken)
+ close_netns(nstoken);
+ SYS_NOFAIL("ip netns del ns1");
+ SYS_NOFAIL("ip netns del ns2");
+}
diff --git a/tools/testing/selftests/bpf/progs/xdp_offloading.c b/tools/testing/selftests/bpf/progs/xdp_offloading.c
new file mode 100644
index 000000000000..5fd88d75b008
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_offloading.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define IFNAMSIZ 16
+
+/* using a special ifname to filter unrelated traffic */
+const __u8 target_ifname[IFNAMSIZ];
+
+/* test outputs: these counters should be 0 to pass tests */
+int64_t invalid_skb = 0;
+
+extern int bpf_xdp_disable_gro(struct xdp_md *xdp) __ksym;
+
+/*
+ * Observing: after XDP disables GRO, gro_disabled bit should be set
+ * and gso_size should be 0.
+ */
+SEC("tp_btf/netif_receive_skb")
+int BPF_PROG(observe_skb_gro_disabled, struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo =
+ (struct skb_shared_info *)(skb->head + skb->end);
+ char devname[IFNAMSIZ];
+ int gso_size;
+
+ __builtin_memcpy(devname, skb->dev->name, IFNAMSIZ);
+ if (bpf_strncmp(devname, IFNAMSIZ, (const char *)target_ifname))
+ return 0;
+
+ if (!skb->gro_disabled)
+ __sync_fetch_and_add(&invalid_skb, 1);
+
+ gso_size = BPF_CORE_READ(shinfo, gso_size);
+ if (gso_size)
+ __sync_fetch_and_add(&invalid_skb, 1);
+
+ return 0;
+}
+
+SEC("xdp")
+int xdp_disable_gro(struct xdp_md *xdp)
+{
+ bpf_xdp_disable_gro(xdp);
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.30.2