netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next] vxlan: add support for flowlabel inherit
@ 2023-09-30 14:28 Alce Lafranque
  2023-09-30 15:29 ` Eric Dumazet
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Alce Lafranque @ 2023-09-30 14:28 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, Ido Schimmel, netdev
  Cc: vincent, alce

By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with an
option for a fixed value. This commits add the ability to inherit the flow
label from the inner packet, like for other tunnel implementations.

```
$ ./ip/ip addr add 2001:db8::2/64 dev dummy1
$ ./ip/ip link set up dev dummy1
$ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2
$ ./ip/ip link set up dev vxlan1
$ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1
$ ./ip/ip link set arp off dev vxlan1
$ ping -q 2001:db8:1::1 &
$ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1
[...]
Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1
    0110 .... = Version: 6
    .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
        .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
        .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
    .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
[...]
Virtual eXtensible Local Area Network
    Flags: 0x0800, VXLAN Network ID (VNI)
    Group Policy ID: 0
    VXLAN Network Identifier (VNI): 100
[...]
Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1
    0110 .... = Version: 6
    .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
        .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
        .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
    .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
```

Signed-off-by: Alce Lafranque <alce@lafranque.net>
Co-developed-by: Vincent Bernat <vincent@bernat.ch>
Signed-off-by: Vincent Bernat <vincent@bernat.ch>
---
 drivers/net/vxlan/vxlan_core.c | 20 ++++++++++++++++++--
 include/net/ip_tunnels.h       | 11 +++++++++++
 include/net/vxlan.h            |  2 ++
 include/uapi/linux/if_link.h   |  1 +
 4 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 5b5597073b00..aa7fbfdd93b1 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2475,7 +2475,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		else
 			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
 #if IS_ENABLED(CONFIG_IPV6)
-		label = vxlan->cfg.label;
+		if (flags & VXLAN_F_LABEL_INHERIT) {
+			label = ip_tunnel_get_flowlabel(old_iph, skb);
+		} else {
+			label = vxlan->cfg.label;
+		}
 #endif
 	} else {
 		if (!info) {
@@ -3286,6 +3290,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_LOCALBYPASS]	= NLA_POLICY_MAX(NLA_U8, 1),
+	[IFLA_VXLAN_LABEL_INHERIT]	= { .type = NLA_FLAG },
 };

 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -4001,7 +4006,15 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],

 	if (data[IFLA_VXLAN_LABEL])
 		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
-			     IPV6_FLOWLABEL_MASK;
+			      IPV6_FLOWLABEL_MASK;
+
+	if (data[IFLA_VXLAN_LABEL_INHERIT]) {
+		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LABEL_INHERIT,
+				    VXLAN_F_LABEL_INHERIT, changelink, false,
+				    extack);
+		if (err)
+			return err;
+	}

 	if (data[IFLA_VXLAN_LEARNING]) {
 		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
@@ -4315,6 +4328,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LABEL_INHERIT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
@@ -4387,6 +4401,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
 	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LABEL_INHERIT,
+		       !!(vxlan->cfg.flags & VXLAN_F_LABEL_INHERIT)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index f346b4efbc30..2d746f4c9a0a 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -416,6 +416,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
 		return 0;
 }

+static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph,
+					     const struct sk_buff *skb)
+{
+	__be16 payload_protocol = skb_protocol(skb, true);
+
+	if (payload_protocol == htons(ETH_P_IPV6))
+		return ip6_flowlabel((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
 static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
 				       const struct sk_buff *skb)
 {
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6a9f8a5f387c..f82ce013c8ff 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -329,6 +329,7 @@ struct vxlan_dev {
 #define VXLAN_F_VNIFILTER               0x20000
 #define VXLAN_F_MDB			0x40000
 #define VXLAN_F_LOCALBYPASS		0x80000
+#define VXLAN_F_LABEL_INHERIT		0x100000

 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -534,6 +535,7 @@ static inline void vxlan_flag_attr_error(int attrtype,
 		break
 	switch (attrtype) {
 	VXLAN_FLAG(TTL_INHERIT);
+	VXLAN_FLAG(LABEL_INHERIT);
 	VXLAN_FLAG(LEARNING);
 	VXLAN_FLAG(PROXY);
 	VXLAN_FLAG(RSC);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index fac351a93aed..bd69af34feba 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -830,6 +830,7 @@ enum {
 	IFLA_VXLAN_DF,
 	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
 	IFLA_VXLAN_LOCALBYPASS,
+	IFLA_VXLAN_LABEL_INHERIT,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
--
2.39.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next] vxlan: add support for flowlabel inherit
  2023-09-30 14:28 [PATCH net-next] vxlan: add support for flowlabel inherit Alce Lafranque
@ 2023-09-30 15:29 ` Eric Dumazet
  2023-09-30 18:16   ` alce
  2023-10-01 21:21   ` Vincent Bernat
  2023-10-03 10:59 ` Ido Schimmel
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2023-09-30 15:29 UTC (permalink / raw)
  To: Alce Lafranque
  Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, David Ahern,
	Ido Schimmel, netdev, vincent

On Sat, Sep 30, 2023 at 5:13 PM Alce Lafranque <alce@lafranque.net> wrote:
>
> By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with an
> option for a fixed value. This commits add the ability to inherit the flow
> label from the inner packet, like for other tunnel implementations.
>
> ```
> $ ./ip/ip addr add 2001:db8::2/64 dev dummy1
> $ ./ip/ip link set up dev dummy1
> $ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2

Side question : How can "flowlabel inherit" can be turned off later
with an "ip link change ..." ?

It seems vxlan_nl2flag() would always turn it 'on' for NLA_FLAG type :

if (vxlan_policy[attrtype].type == NLA_FLAG)
    flags = conf->flags | mask;  // always turn on
else if (nla_get_u8(tb[attrtype]))    // dead code for NLA_FLAG
    flags = conf->flags | mask;
else
    flags = conf->flags & ~mask;

conf->flags = flags;


> $ ./ip/ip link set up dev vxlan1
> $ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1
> $ ./ip/ip link set arp off dev vxlan1
> $ ping -q 2001:db8:1::1 &
> $ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1
> [...]
> Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1
>     0110 .... = Version: 6
>     .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
>         .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
>         .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
>     .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
> [...]
> Virtual eXtensible Local Area Network
>     Flags: 0x0800, VXLAN Network ID (VNI)
>     Group Policy ID: 0
>     VXLAN Network Identifier (VNI): 100
> [...]
> Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1
>     0110 .... = Version: 6
>     .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
>         .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
>         .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
>     .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
> ```
>
> Signed-off-by: Alce Lafranque <alce@lafranque.net>
> Co-developed-by: Vincent Bernat <vincent@bernat.ch>
> Signed-off-by: Vincent Bernat <vincent@bernat.ch>
> ---
>  drivers/net/vxlan/vxlan_core.c | 20 ++++++++++++++++++--
>  include/net/ip_tunnels.h       | 11 +++++++++++
>  include/net/vxlan.h            |  2 ++
>  include/uapi/linux/if_link.h   |  1 +
>  4 files changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
> index 5b5597073b00..aa7fbfdd93b1 100644
> --- a/drivers/net/vxlan/vxlan_core.c
> +++ b/drivers/net/vxlan/vxlan_core.c
> @@ -2475,7 +2475,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>                 else
>                         udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
>  #if IS_ENABLED(CONFIG_IPV6)
> -               label = vxlan->cfg.label;
> +               if (flags & VXLAN_F_LABEL_INHERIT) {
> +                       label = ip_tunnel_get_flowlabel(old_iph, skb);
> +               } else {
> +                       label = vxlan->cfg.label;
> +               }

You can remove the braces.

>  #endif
>         } else {
>                 if (!info) {
> @@ -3286,6 +3290,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
>         [IFLA_VXLAN_DF]         = { .type = NLA_U8 },
>         [IFLA_VXLAN_VNIFILTER]  = { .type = NLA_U8 },
>         [IFLA_VXLAN_LOCALBYPASS]        = NLA_POLICY_MAX(NLA_U8, 1),
> +       [IFLA_VXLAN_LABEL_INHERIT]      = { .type = NLA_FLAG },
>  };
>
>  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
> @@ -4001,7 +4006,15 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
>
>         if (data[IFLA_VXLAN_LABEL])
>                 conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
> -                            IPV6_FLOWLABEL_MASK;
> +                             IPV6_FLOWLABEL_MASK;
> +
> +       if (data[IFLA_VXLAN_LABEL_INHERIT]) {
> +               err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LABEL_INHERIT,
> +                                   VXLAN_F_LABEL_INHERIT, changelink, false,
> +                                   extack);
> +               if (err)
> +                       return err;
> +       }
>
>         if (data[IFLA_VXLAN_LEARNING]) {
>                 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
> @@ -4315,6 +4328,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_DF */
>                 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
> +               nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LABEL_INHERIT */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
> @@ -4387,6 +4401,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
>             nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
>             nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
>             nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
> +           nla_put_u8(skb, IFLA_VXLAN_LABEL_INHERIT,
> +                      !!(vxlan->cfg.flags & VXLAN_F_LABEL_INHERIT)) ||

This seems in contradiction with NLA_FLAG semantics if the flag can
not be turned off.

Look for nla_put_flag(). User space could get confused.

>             nla_put_u8(skb, IFLA_VXLAN_LEARNING,
>                        !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
>             nla_put_u8(skb, IFLA_VXLAN_PROXY,
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next] vxlan: add support for flowlabel inherit
  2023-09-30 15:29 ` Eric Dumazet
@ 2023-09-30 18:16   ` alce
  2023-10-01 21:21   ` Vincent Bernat
  1 sibling, 0 replies; 10+ messages in thread
From: alce @ 2023-09-30 18:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, David Ahern,
	Ido Schimmel, netdev, vincent

Le 2023-09-30 17:29, Eric Dumazet a écrit :
> Side question : How can "flowlabel inherit" can be turned off later
> with an "ip link change ..." ?
> 
> It seems vxlan_nl2flag() would always turn it 'on' for NLA_FLAG type :
> 
> if (vxlan_policy[attrtype].type == NLA_FLAG)
>     flags = conf->flags | mask;  // always turn on
> else if (nla_get_u8(tb[attrtype]))    // dead code for NLA_FLAG
>     flags = conf->flags | mask;
> else
>     flags = conf->flags & ~mask;
> 
> conf->flags = flags;

The "flow label inherit" cannot be turned off after the
interface has been created, we have followed the logic used by 
TTL_INHERIT.

If the behavior isn't meant to be, we can try to modify it.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next] vxlan: add support for flowlabel inherit
  2023-09-30 15:29 ` Eric Dumazet
  2023-09-30 18:16   ` alce
@ 2023-10-01 21:21   ` Vincent Bernat
  1 sibling, 0 replies; 10+ messages in thread
From: Vincent Bernat @ 2023-10-01 21:21 UTC (permalink / raw)
  To: Eric Dumazet, Alce Lafranque
  Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, David Ahern,
	Ido Schimmel, netdev

On 2023-09-30 17:29, Eric Dumazet wrote:

>> $ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2
> 
> Side question : How can "flowlabel inherit" can be turned off later
> with an "ip link change ..." ?
> 
> It seems vxlan_nl2flag() would always turn it 'on' for NLA_FLAG type :
> 
> if (vxlan_policy[attrtype].type == NLA_FLAG)
>      flags = conf->flags | mask;  // always turn on
> else if (nla_get_u8(tb[attrtype]))    // dead code for NLA_FLAG
>      flags = conf->flags | mask;
> else
>      flags = conf->flags & ~mask;
> 
> conf->flags = flags;

Most "flags" in vxlan module cannot be changed (see 
vxlan_flag_attr_error()).

IFLA_VXLAN_TTL_INHERIT seems to be the only one using NLA_FLAG. All the 
others are using NLA_U8 and in iproute2, they use XXXX and noXXXX option 
style. I suppose it makes sense to do this way if you don't know what 
the default value is.

For IFLA_VXLAN_TTL_INHERIT, iproute2 treat it as a u8 when reading, but 
as a flag when writing. I don't know if it makes sense to turn it to a 
true flag or if it would be considered as breaking userspace? iproute2 
would be OK, but I suppose that another piece of userland could have put 
the attribute with a value of 0.

For IFLA_VXLAN_LABEL_INHERIT, we can use a proper flag from start.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next] vxlan: add support for flowlabel inherit
  2023-09-30 14:28 [PATCH net-next] vxlan: add support for flowlabel inherit Alce Lafranque
  2023-09-30 15:29 ` Eric Dumazet
@ 2023-10-03 10:59 ` Ido Schimmel
  2023-10-07 14:46   ` alce
  2023-10-07 14:26 ` [PATCH net-next v2] " Alce Lafranque
  2023-10-07 17:09 ` [PATCH net-next] " Tom Herbert
  3 siblings, 1 reply; 10+ messages in thread
From: Ido Schimmel @ 2023-10-03 10:59 UTC (permalink / raw)
  To: Alce Lafranque
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, netdev, vincent

On Sat, Sep 30, 2023 at 09:28:19AM -0500, Alce Lafranque wrote:
> By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with an
> option for a fixed value. This commits add the ability to inherit the flow
> label from the inner packet, like for other tunnel implementations.

Please add motivation to the commit message. At first glance it's not
clear why this is needed given that the entropy is encoded in the UDP
source port (unlike ip6gre, for example). I assume the motivation is
related to devices that can't calculate ECMP/RSS hash based on L4
headers, but I prefer not to guess :)

Regarding the netlink interface, I think we should introduce a new u8
attribute that encodes the flow label policy:

0 - Fixed. Taken from IFLA_VXLAN_LABEL. This is the default.
1 - Inherit. Copy if packet is IPv6, otherwise set to 0.

This will allow us to add more policies in the future:

2 - Hash. Set the flow label according to the hash of the inner packet.
Useful when inner packet is not IPv6.
3 - Inherit with hash fallback. Copy if packet is IPv6, otherwise set
according to the hash of the inner packet.

This already exists in at least one hardware implementation that I'm
familiar with, so I see no reason not to create provisions for this in a
software implementation. We can implement the last two policies if/when
someone asks for them.

The command line interface can remain as you have it now:

 # ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2

But unlike the current implementation, user space will be able to change
it on the fly:

 # ip link set vxlan1 type vxlan flowlabel 5
 # ip link set vxlan1 type vxlan flowlabel inherit

Unless iproute2 is taught to probe for the new policy attribute, then
when setting the flow label to a fixed value iproute2 shouldn't default
to sending the new attribute as it will be rejected by old kernels.
Instead, the kernel can be taught that the presence of IFLA_VXLAN_LABEL
implies the default policy.

The JSON output can mimic the netlink structure and add a new key for
the new policy attribute.

Thanks

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH net-next v2] vxlan: add support for flowlabel inherit
  2023-09-30 14:28 [PATCH net-next] vxlan: add support for flowlabel inherit Alce Lafranque
  2023-09-30 15:29 ` Eric Dumazet
  2023-10-03 10:59 ` Ido Schimmel
@ 2023-10-07 14:26 ` Alce Lafranque
  2023-10-07 15:44   ` kernel test robot
  2023-10-11  7:11   ` Ido Schimmel
  2023-10-07 17:09 ` [PATCH net-next] " Tom Herbert
  3 siblings, 2 replies; 10+ messages in thread
From: Alce Lafranque @ 2023-10-07 14:26 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, Ido Schimmel, netdev
  Cc: Alce Lafranque, Vincent Bernat

By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with
an option for a fixed value. This commits add the ability to inherit the
flow label from the inner packet, like for other tunnel implementations.
This enables devices using only L3 headers for ECMP to correctly balance
VXLAN-encapsulated IPv6 packets.

```
$ ./ip/ip link add dummy1 type dummy
$ ./ip/ip addr add 2001:db8::2/64 dev dummy1
$ ./ip/ip link set up dev dummy1
$ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2
$ ./ip/ip link set up dev vxlan1
$ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1
$ ./ip/ip link set arp off dev vxlan1
$ ping -q 2001:db8:1::1 &
$ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1
[...]
Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1
    0110 .... = Version: 6
    .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
        .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
        .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
    .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
[...]
Virtual eXtensible Local Area Network
    Flags: 0x0800, VXLAN Network ID (VNI)
    Group Policy ID: 0
    VXLAN Network Identifier (VNI): 100
[...]
Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1
    0110 .... = Version: 6
    .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
        .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
        .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
    .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
```

Signed-off-by: Alce Lafranque <alce@lafranque.net>
Co-developed-by: Vincent Bernat <vincent@bernat.ch>
Signed-off-by: Vincent Bernat <vincent@bernat.ch>
---
 drivers/net/vxlan/vxlan_core.c | 15 ++++++++++++++-
 include/net/ip_tunnels.h       | 11 +++++++++++
 include/net/vxlan.h            | 33 +++++++++++++++++----------------
 include/uapi/linux/if_link.h   |  8 ++++++++
 4 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 5b5597073b00..d1f2376c0c73 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2475,7 +2475,14 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		else
 			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
 #if IS_ENABLED(CONFIG_IPV6)
-		label = vxlan->cfg.label;
+		switch (vxlan->cfg.label_behavior) {
+		case VXLAN_LABEL_FIXED:
+			label = vxlan->cfg.label;
+			break;
+		case VXLAN_LABEL_INHERIT:
+			label = ip_tunnel_get_flowlabel(old_iph, skb);
+			break;
+		}
 #endif
 	} else {
 		if (!info) {
@@ -3286,6 +3293,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_LOCALBYPASS]	= NLA_POLICY_MAX(NLA_U8, 1),
+	[IFLA_VXLAN_LABEL_BEHAVIOR]	= NLA_POLICY_MAX(NLA_U8, VXLAN_LABEL_MAX),
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -4003,6 +4011,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
 			     IPV6_FLOWLABEL_MASK;
 
+	if (data[IFLA_VXLAN_LABEL_BEHAVIOR])
+		conf->label_behavior = nla_get_u8(data[IFLA_VXLAN_LABEL_BEHAVIOR]);
+
 	if (data[IFLA_VXLAN_LEARNING]) {
 		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
 				    VXLAN_F_LEARN, changelink, true,
@@ -4315,6 +4326,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
+		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LABEL_BEHAVIOR */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
@@ -4387,6 +4399,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
 	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LABEL_BEHAVIOR, vxlan->cfg.label_behavior) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index f346b4efbc30..2d746f4c9a0a 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -416,6 +416,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
 		return 0;
 }
 
+static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph,
+					     const struct sk_buff *skb)
+{
+	__be16 payload_protocol = skb_protocol(skb, true);
+
+	if (payload_protocol == htons(ETH_P_IPV6))
+		return ip6_flowlabel((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
 static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
 				       const struct sk_buff *skb)
 {
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6a9f8a5f387c..9ccbc8b7b8f9 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -210,22 +210,23 @@ struct vxlan_rdst {
 };
 
 struct vxlan_config {
-	union vxlan_addr	remote_ip;
-	union vxlan_addr	saddr;
-	__be32			vni;
-	int			remote_ifindex;
-	int			mtu;
-	__be16			dst_port;
-	u16			port_min;
-	u16			port_max;
-	u8			tos;
-	u8			ttl;
-	__be32			label;
-	u32			flags;
-	unsigned long		age_interval;
-	unsigned int		addrmax;
-	bool			no_share;
-	enum ifla_vxlan_df	df;
+	union vxlan_addr		remote_ip;
+	union vxlan_addr		saddr;
+	__be32				vni;
+	int				remote_ifindex;
+	int				mtu;
+	__be16				dst_port;
+	u16				port_min;
+	u16				port_max;
+	u8				tos;
+	u8				ttl;
+	__be32				label;
+	enum ifla_vxlan_label_behavior	label_behavior;
+	u32				flags;
+	unsigned long			age_interval;
+	unsigned int			addrmax;
+	bool				no_share;
+	enum ifla_vxlan_df		df;
 };
 
 enum {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index fac351a93aed..13afc4afcc76 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -830,6 +830,7 @@ enum {
 	IFLA_VXLAN_DF,
 	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
 	IFLA_VXLAN_LOCALBYPASS,
+	IFLA_VXLAN_LABEL_BEHAVIOR,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
@@ -847,6 +848,13 @@ enum ifla_vxlan_df {
 	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
 };
 
+enum ifla_vxlan_label_behavior {
+	VXLAN_LABEL_FIXED = 0,
+	VXLAN_LABEL_INHERIT = 1,
+	__VXLAN_LABEL_END,
+	VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1,
+};
+
 /* GENEVE section */
 enum {
 	IFLA_GENEVE_UNSPEC,
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next] vxlan: add support for flowlabel inherit
  2023-10-03 10:59 ` Ido Schimmel
@ 2023-10-07 14:46   ` alce
  0 siblings, 0 replies; 10+ messages in thread
From: alce @ 2023-10-07 14:46 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, netdev, vincent

Le 2023-10-03 12:59, Ido Schimmel a écrit :
> Unless iproute2 is taught to probe for the new policy attribute, then
> when setting the flow label to a fixed value iproute2 shouldn't default
> to sending the new attribute as it will be rejected by old kernels.
> Instead, the kernel can be taught that the presence of IFLA_VXLAN_LABEL
> implies the default policy.


I have sent an updated version of the patch. It follows your proposition 
of using an enum to define the behavior of the flow label field. The 
kernel does not switch back the behavior to the default value when 
sending IFLA_VXLAN_LABEL attribute as we checked that older kernels do 
not reject the netlink message when it contains both IFLA_VXLAN_LABEL 
and IFLA_VXLAN_LABEL_BEHAVIOR, but we can change that if needed. Also, 
"ip link set" works and the behavior and label can be changed on an 
existing device.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v2] vxlan: add support for flowlabel inherit
  2023-10-07 14:26 ` [PATCH net-next v2] " Alce Lafranque
@ 2023-10-07 15:44   ` kernel test robot
  2023-10-11  7:11   ` Ido Schimmel
  1 sibling, 0 replies; 10+ messages in thread
From: kernel test robot @ 2023-10-07 15:44 UTC (permalink / raw)
  To: Alce Lafranque, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, David Ahern, Ido Schimmel
  Cc: oe-kbuild-all, netdev, Alce Lafranque, Vincent Bernat

Hi Alce,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Alce-Lafranque/vxlan-add-support-for-flowlabel-inherit/20231007-222907
base:   net-next/main
patch link:    https://lore.kernel.org/r/20231007142624.739192-1-alce%40lafranque.net
patch subject: [PATCH net-next v2] vxlan: add support for flowlabel inherit
config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20231007/202310072308.H5ORfHTn-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231007/202310072308.H5ORfHTn-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310072308.H5ORfHTn-lkp@intel.com/

All warnings (new ones prefixed by >>):

   drivers/net/vxlan/vxlan_core.c: In function 'vxlan_xmit_one':
>> drivers/net/vxlan/vxlan_core.c:2478:17: warning: enumeration value '__VXLAN_LABEL_END' not handled in switch [-Wswitch]
    2478 |                 switch (vxlan->cfg.label_behavior) {
         |                 ^~~~~~


vim +/__VXLAN_LABEL_END +2478 drivers/net/vxlan/vxlan_core.c

  2440	
  2441		info = skb_tunnel_info(skb);
  2442	
  2443		if (rdst) {
  2444			dst = &rdst->remote_ip;
  2445			if (vxlan_addr_any(dst)) {
  2446				if (did_rsc) {
  2447					/* short-circuited back to local bridge */
  2448					vxlan_encap_bypass(skb, vxlan, vxlan,
  2449							   default_vni, true);
  2450					return;
  2451				}
  2452				goto drop;
  2453			}
  2454	
  2455			dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
  2456			vni = (rdst->remote_vni) ? : default_vni;
  2457			ifindex = rdst->remote_ifindex;
  2458			local_ip = vxlan->cfg.saddr;
  2459			dst_cache = &rdst->dst_cache;
  2460			md->gbp = skb->mark;
  2461			if (flags & VXLAN_F_TTL_INHERIT) {
  2462				ttl = ip_tunnel_get_ttl(old_iph, skb);
  2463			} else {
  2464				ttl = vxlan->cfg.ttl;
  2465				if (!ttl && vxlan_addr_multicast(dst))
  2466					ttl = 1;
  2467			}
  2468	
  2469			tos = vxlan->cfg.tos;
  2470			if (tos == 1)
  2471				tos = ip_tunnel_get_dsfield(old_iph, skb);
  2472	
  2473			if (dst->sa.sa_family == AF_INET)
  2474				udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
  2475			else
  2476				udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
  2477	#if IS_ENABLED(CONFIG_IPV6)
> 2478			switch (vxlan->cfg.label_behavior) {
  2479			case VXLAN_LABEL_FIXED:
  2480				label = vxlan->cfg.label;
  2481				break;
  2482			case VXLAN_LABEL_INHERIT:
  2483				label = ip_tunnel_get_flowlabel(old_iph, skb);
  2484				break;
  2485			}
  2486	#endif
  2487		} else {
  2488			if (!info) {
  2489				WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
  2490					  dev->name);
  2491				goto drop;
  2492			}
  2493			remote_ip.sa.sa_family = ip_tunnel_info_af(info);
  2494			if (remote_ip.sa.sa_family == AF_INET) {
  2495				remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
  2496				local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
  2497			} else {
  2498				remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
  2499				local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
  2500			}
  2501			dst = &remote_ip;
  2502			dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
  2503			flow_flags = info->key.flow_flags;
  2504			vni = tunnel_id_to_key32(info->key.tun_id);
  2505			ifindex = 0;
  2506			dst_cache = &info->dst_cache;
  2507			if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
  2508				if (info->options_len < sizeof(*md))
  2509					goto drop;
  2510				md = ip_tunnel_info_opts(info);
  2511			}
  2512			ttl = info->key.ttl;
  2513			tos = info->key.tos;
  2514	#if IS_ENABLED(CONFIG_IPV6)
  2515			label = info->key.label;
  2516	#endif
  2517			udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
  2518		}
  2519		src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
  2520					     vxlan->cfg.port_max, true);
  2521	
  2522		rcu_read_lock();
  2523		if (dst->sa.sa_family == AF_INET) {
  2524			struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
  2525			struct rtable *rt;
  2526			__be16 df = 0;
  2527	
  2528			if (!ifindex)
  2529				ifindex = sock4->sock->sk->sk_bound_dev_if;
  2530	
  2531			rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
  2532					     dst->sin.sin_addr.s_addr,
  2533					     &local_ip.sin.sin_addr.s_addr,
  2534					     dst_port, src_port, flow_flags,
  2535					     dst_cache, info);
  2536			if (IS_ERR(rt)) {
  2537				err = PTR_ERR(rt);
  2538				goto tx_error;
  2539			}
  2540	
  2541			if (!info) {
  2542				/* Bypass encapsulation if the destination is local */
  2543				err = encap_bypass_if_local(skb, dev, vxlan, dst,
  2544							    dst_port, ifindex, vni,
  2545							    &rt->dst, rt->rt_flags);
  2546				if (err)
  2547					goto out_unlock;
  2548	
  2549				if (vxlan->cfg.df == VXLAN_DF_SET) {
  2550					df = htons(IP_DF);
  2551				} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
  2552					struct ethhdr *eth = eth_hdr(skb);
  2553	
  2554					if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
  2555					    (ntohs(eth->h_proto) == ETH_P_IP &&
  2556					     old_iph->frag_off & htons(IP_DF)))
  2557						df = htons(IP_DF);
  2558				}
  2559			} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
  2560				df = htons(IP_DF);
  2561			}
  2562	
  2563			ndst = &rt->dst;
  2564			err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE),
  2565						    netif_is_any_bridge_port(dev));
  2566			if (err < 0) {
  2567				goto tx_error;
  2568			} else if (err) {
  2569				if (info) {
  2570					struct ip_tunnel_info *unclone;
  2571					struct in_addr src, dst;
  2572	
  2573					unclone = skb_tunnel_info_unclone(skb);
  2574					if (unlikely(!unclone))
  2575						goto tx_error;
  2576	
  2577					src = remote_ip.sin.sin_addr;
  2578					dst = local_ip.sin.sin_addr;
  2579					unclone->key.u.ipv4.src = src.s_addr;
  2580					unclone->key.u.ipv4.dst = dst.s_addr;
  2581				}
  2582				vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
  2583				dst_release(ndst);
  2584				goto out_unlock;
  2585			}
  2586	
  2587			tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
  2588			ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
  2589			err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
  2590					      vni, md, flags, udp_sum);
  2591			if (err < 0)
  2592				goto tx_error;
  2593	
  2594			udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
  2595					    dst->sin.sin_addr.s_addr, tos, ttl, df,
  2596					    src_port, dst_port, xnet, !udp_sum);
  2597	#if IS_ENABLED(CONFIG_IPV6)
  2598		} else {
  2599			struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
  2600	
  2601			if (!ifindex)
  2602				ifindex = sock6->sock->sk->sk_bound_dev_if;
  2603	
  2604			ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
  2605						label, &dst->sin6.sin6_addr,
  2606						&local_ip.sin6.sin6_addr,
  2607						dst_port, src_port,
  2608						dst_cache, info);
  2609			if (IS_ERR(ndst)) {
  2610				err = PTR_ERR(ndst);
  2611				ndst = NULL;
  2612				goto tx_error;
  2613			}
  2614	
  2615			if (!info) {
  2616				u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
  2617	
  2618				err = encap_bypass_if_local(skb, dev, vxlan, dst,
  2619							    dst_port, ifindex, vni,
  2620							    ndst, rt6i_flags);
  2621				if (err)
  2622					goto out_unlock;
  2623			}
  2624	
  2625			err = skb_tunnel_check_pmtu(skb, ndst,
  2626						    vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6),
  2627						    netif_is_any_bridge_port(dev));
  2628			if (err < 0) {
  2629				goto tx_error;
  2630			} else if (err) {
  2631				if (info) {
  2632					struct ip_tunnel_info *unclone;
  2633					struct in6_addr src, dst;
  2634	
  2635					unclone = skb_tunnel_info_unclone(skb);
  2636					if (unlikely(!unclone))
  2637						goto tx_error;
  2638	
  2639					src = remote_ip.sin6.sin6_addr;
  2640					dst = local_ip.sin6.sin6_addr;
  2641					unclone->key.u.ipv6.src = src;
  2642					unclone->key.u.ipv6.dst = dst;
  2643				}
  2644	
  2645				vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
  2646				dst_release(ndst);
  2647				goto out_unlock;
  2648			}
  2649	
  2650			tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
  2651			ttl = ttl ? : ip6_dst_hoplimit(ndst);
  2652			skb_scrub_packet(skb, xnet);
  2653			err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
  2654					      vni, md, flags, udp_sum);
  2655			if (err < 0)
  2656				goto tx_error;
  2657	
  2658			udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
  2659					     &local_ip.sin6.sin6_addr,
  2660					     &dst->sin6.sin6_addr, tos, ttl,
  2661					     label, src_port, dst_port, !udp_sum);
  2662	#endif
  2663		}
  2664		vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
  2665	out_unlock:
  2666		rcu_read_unlock();
  2667		return;
  2668	
  2669	drop:
  2670		dev->stats.tx_dropped++;
  2671		vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0);
  2672		dev_kfree_skb(skb);
  2673		return;
  2674	
  2675	tx_error:
  2676		rcu_read_unlock();
  2677		if (err == -ELOOP)
  2678			dev->stats.collisions++;
  2679		else if (err == -ENETUNREACH)
  2680			dev->stats.tx_carrier_errors++;
  2681		dst_release(ndst);
  2682		dev->stats.tx_errors++;
  2683		vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
  2684		kfree_skb(skb);
  2685	}
  2686	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next] vxlan: add support for flowlabel inherit
  2023-09-30 14:28 [PATCH net-next] vxlan: add support for flowlabel inherit Alce Lafranque
                   ` (2 preceding siblings ...)
  2023-10-07 14:26 ` [PATCH net-next v2] " Alce Lafranque
@ 2023-10-07 17:09 ` Tom Herbert
  3 siblings, 0 replies; 10+ messages in thread
From: Tom Herbert @ 2023-10-07 17:09 UTC (permalink / raw)
  To: Alce Lafranque
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, Ido Schimmel, netdev, vincent

On Sat, Sep 30, 2023 at 8:14 AM Alce Lafranque <alce@lafranque.net> wrote:
>
> By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with an
> option for a fixed value. This commits add the ability to inherit the flow
> label from the inner packet, like for other tunnel implementations.

Is there any reason not to make setting the flow label on by default?
We've been doing this for TCP for quite a while now and it's not
breaking the network.

Tom

>
> ```
> $ ./ip/ip addr add 2001:db8::2/64 dev dummy1
> $ ./ip/ip link set up dev dummy1
> $ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2

Alce,

Is there any reason not to always set the flow label or at least make
setting the flow label be the default? We've been doing this for TCP
for quite a while now and it's not breaking the network, and we're
already setting the UDP source port with flow entropy in VXLAN anyway.

Tom

> $ ./ip/ip link set up dev vxlan1
> $ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1
> $ ./ip/ip link set arp off dev vxlan1
> $ ping -q 2001:db8:1::1 &
> $ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1
> [...]
> Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1
>     0110 .... = Version: 6
>     .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
>         .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
>         .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
>     .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
> [...]
> Virtual eXtensible Local Area Network
>     Flags: 0x0800, VXLAN Network ID (VNI)
>     Group Policy ID: 0
>     VXLAN Network Identifier (VNI): 100
> [...]
> Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1
>     0110 .... = Version: 6
>     .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
>         .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
>         .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
>     .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
> ```
>
> Signed-off-by: Alce Lafranque <alce@lafranque.net>
> Co-developed-by: Vincent Bernat <vincent@bernat.ch>
> Signed-off-by: Vincent Bernat <vincent@bernat.ch>
> ---
>  drivers/net/vxlan/vxlan_core.c | 20 ++++++++++++++++++--
>  include/net/ip_tunnels.h       | 11 +++++++++++
>  include/net/vxlan.h            |  2 ++
>  include/uapi/linux/if_link.h   |  1 +
>  4 files changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
> index 5b5597073b00..aa7fbfdd93b1 100644
> --- a/drivers/net/vxlan/vxlan_core.c
> +++ b/drivers/net/vxlan/vxlan_core.c
> @@ -2475,7 +2475,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>                 else
>                         udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
>  #if IS_ENABLED(CONFIG_IPV6)
> -               label = vxlan->cfg.label;
> +               if (flags & VXLAN_F_LABEL_INHERIT) {
> +                       label = ip_tunnel_get_flowlabel(old_iph, skb);
> +               } else {
> +                       label = vxlan->cfg.label;
> +               }
>  #endif
>         } else {
>                 if (!info) {
> @@ -3286,6 +3290,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
>         [IFLA_VXLAN_DF]         = { .type = NLA_U8 },
>         [IFLA_VXLAN_VNIFILTER]  = { .type = NLA_U8 },
>         [IFLA_VXLAN_LOCALBYPASS]        = NLA_POLICY_MAX(NLA_U8, 1),
> +       [IFLA_VXLAN_LABEL_INHERIT]      = { .type = NLA_FLAG },
>  };
>
>  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
> @@ -4001,7 +4006,15 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
>
>         if (data[IFLA_VXLAN_LABEL])
>                 conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
> -                            IPV6_FLOWLABEL_MASK;
> +                             IPV6_FLOWLABEL_MASK;
> +
> +       if (data[IFLA_VXLAN_LABEL_INHERIT]) {
> +               err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LABEL_INHERIT,
> +                                   VXLAN_F_LABEL_INHERIT, changelink, false,
> +                                   extack);
> +               if (err)
> +                       return err;
> +       }
>
>         if (data[IFLA_VXLAN_LEARNING]) {
>                 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
> @@ -4315,6 +4328,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_DF */
>                 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
> +               nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LABEL_INHERIT */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
>                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
> @@ -4387,6 +4401,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
>             nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
>             nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
>             nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
> +           nla_put_u8(skb, IFLA_VXLAN_LABEL_INHERIT,
> +                      !!(vxlan->cfg.flags & VXLAN_F_LABEL_INHERIT)) ||
>             nla_put_u8(skb, IFLA_VXLAN_LEARNING,
>                        !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
>             nla_put_u8(skb, IFLA_VXLAN_PROXY,
> diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
> index f346b4efbc30..2d746f4c9a0a 100644
> --- a/include/net/ip_tunnels.h
> +++ b/include/net/ip_tunnels.h
> @@ -416,6 +416,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
>                 return 0;
>  }
>
> +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph,
> +                                            const struct sk_buff *skb)
> +{
> +       __be16 payload_protocol = skb_protocol(skb, true);
> +
> +       if (payload_protocol == htons(ETH_P_IPV6))
> +               return ip6_flowlabel((const struct ipv6hdr *)iph);
> +       else
> +               return 0;
> +}
> +
>  static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
>                                        const struct sk_buff *skb)
>  {
> diff --git a/include/net/vxlan.h b/include/net/vxlan.h
> index 6a9f8a5f387c..f82ce013c8ff 100644
> --- a/include/net/vxlan.h
> +++ b/include/net/vxlan.h
> @@ -329,6 +329,7 @@ struct vxlan_dev {
>  #define VXLAN_F_VNIFILTER               0x20000
>  #define VXLAN_F_MDB                    0x40000
>  #define VXLAN_F_LOCALBYPASS            0x80000
> +#define VXLAN_F_LABEL_INHERIT          0x100000
>
>  /* Flags that are used in the receive path. These flags must match in
>   * order for a socket to be shareable
> @@ -534,6 +535,7 @@ static inline void vxlan_flag_attr_error(int attrtype,
>                 break
>         switch (attrtype) {
>         VXLAN_FLAG(TTL_INHERIT);
> +       VXLAN_FLAG(LABEL_INHERIT);
>         VXLAN_FLAG(LEARNING);
>         VXLAN_FLAG(PROXY);
>         VXLAN_FLAG(RSC);
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index fac351a93aed..bd69af34feba 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -830,6 +830,7 @@ enum {
>         IFLA_VXLAN_DF,
>         IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
>         IFLA_VXLAN_LOCALBYPASS,
> +       IFLA_VXLAN_LABEL_INHERIT,
>         __IFLA_VXLAN_MAX
>  };
>  #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
> --
> 2.39.2
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next v2] vxlan: add support for flowlabel inherit
  2023-10-07 14:26 ` [PATCH net-next v2] " Alce Lafranque
  2023-10-07 15:44   ` kernel test robot
@ 2023-10-11  7:11   ` Ido Schimmel
  1 sibling, 0 replies; 10+ messages in thread
From: Ido Schimmel @ 2023-10-11  7:11 UTC (permalink / raw)
  To: Alce Lafranque
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	David Ahern, netdev, Vincent Bernat

Process note: Please post new versions as a separate thread with a
changelog:
https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html#resending-after-review

On Sat, Oct 07, 2023 at 09:26:24AM -0500, Alce Lafranque wrote:
> By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with
> an option for a fixed value. This commits add the ability to inherit the
> flow label from the inner packet, like for other tunnel implementations.
> This enables devices using only L3 headers for ECMP to correctly balance
> VXLAN-encapsulated IPv6 packets.
> 
> ```
> $ ./ip/ip link add dummy1 type dummy
> $ ./ip/ip addr add 2001:db8::2/64 dev dummy1
> $ ./ip/ip link set up dev dummy1
> $ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2
> $ ./ip/ip link set up dev vxlan1
> $ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1
> $ ./ip/ip link set arp off dev vxlan1
> $ ping -q 2001:db8:1::1 &
> $ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1
> [...]
> Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1
>     0110 .... = Version: 6
>     .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
>         .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
>         .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
>     .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
> [...]
> Virtual eXtensible Local Area Network
>     Flags: 0x0800, VXLAN Network ID (VNI)
>     Group Policy ID: 0
>     VXLAN Network Identifier (VNI): 100
> [...]
> Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1
>     0110 .... = Version: 6
>     .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT)
>         .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0)
>         .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0)
>     .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb
> ```
> 
> Signed-off-by: Alce Lafranque <alce@lafranque.net>
> Co-developed-by: Vincent Bernat <vincent@bernat.ch>
> Signed-off-by: Vincent Bernat <vincent@bernat.ch>
> ---
>  drivers/net/vxlan/vxlan_core.c | 15 ++++++++++++++-
>  include/net/ip_tunnels.h       | 11 +++++++++++
>  include/net/vxlan.h            | 33 +++++++++++++++++----------------
>  include/uapi/linux/if_link.h   |  8 ++++++++
>  4 files changed, 50 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
> index 5b5597073b00..d1f2376c0c73 100644
> --- a/drivers/net/vxlan/vxlan_core.c
> +++ b/drivers/net/vxlan/vxlan_core.c
> @@ -2475,7 +2475,14 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  		else
>  			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
>  #if IS_ENABLED(CONFIG_IPV6)
> -		label = vxlan->cfg.label;
> +		switch (vxlan->cfg.label_behavior) {
> +		case VXLAN_LABEL_FIXED:
> +			label = vxlan->cfg.label;
> +			break;
> +		case VXLAN_LABEL_INHERIT:
> +			label = ip_tunnel_get_flowlabel(old_iph, skb);
> +			break;

I saw the kbuild robot complaining about this. You can add:

default:
	DEBUG_NET_WARN_ON_ONCE(1);
	goto drop;

> +		}
>  #endif
>  	} else {
>  		if (!info) {
> @@ -3286,6 +3293,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
>  	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
>  	[IFLA_VXLAN_VNIFILTER]	= { .type = NLA_U8 },
>  	[IFLA_VXLAN_LOCALBYPASS]	= NLA_POLICY_MAX(NLA_U8, 1),
> +	[IFLA_VXLAN_LABEL_BEHAVIOR]	= NLA_POLICY_MAX(NLA_U8, VXLAN_LABEL_MAX),

My preference would be IFLA_VXLAN_LABEL_POLICY.

>  };
>  
>  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
> @@ -4003,6 +4011,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
>  		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
>  			     IPV6_FLOWLABEL_MASK;
>  
> +	if (data[IFLA_VXLAN_LABEL_BEHAVIOR])
> +		conf->label_behavior = nla_get_u8(data[IFLA_VXLAN_LABEL_BEHAVIOR]);

There is a check in vxlan_config_validate() that prevents setting a
non-zero flow label when the VXLAN device encapsulates using IPv4. For
consistency it would be good to include a similar check regarding the
flow label policy.

> +
>  	if (data[IFLA_VXLAN_LEARNING]) {
>  		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
>  				    VXLAN_F_LEARN, changelink, true,
> @@ -4315,6 +4326,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
>  		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
>  		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
>  		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LABEL_BEHAVIOR */
>  		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
>  		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
>  		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
> @@ -4387,6 +4399,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
>  	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
>  	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
>  	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_LABEL_BEHAVIOR, vxlan->cfg.label_behavior) ||
>  	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
>  		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
>  	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
> diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
> index f346b4efbc30..2d746f4c9a0a 100644
> --- a/include/net/ip_tunnels.h
> +++ b/include/net/ip_tunnels.h
> @@ -416,6 +416,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
>  		return 0;
>  }
>  
> +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph,
> +					     const struct sk_buff *skb)
> +{
> +	__be16 payload_protocol = skb_protocol(skb, true);
> +
> +	if (payload_protocol == htons(ETH_P_IPV6))
> +		return ip6_flowlabel((const struct ipv6hdr *)iph);
> +	else
> +		return 0;
> +}
> +
>  static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
>  				       const struct sk_buff *skb)
>  {
> diff --git a/include/net/vxlan.h b/include/net/vxlan.h
> index 6a9f8a5f387c..9ccbc8b7b8f9 100644
> --- a/include/net/vxlan.h
> +++ b/include/net/vxlan.h
> @@ -210,22 +210,23 @@ struct vxlan_rdst {
>  };
>  
>  struct vxlan_config {
> -	union vxlan_addr	remote_ip;
> -	union vxlan_addr	saddr;
> -	__be32			vni;
> -	int			remote_ifindex;
> -	int			mtu;
> -	__be16			dst_port;
> -	u16			port_min;
> -	u16			port_max;
> -	u8			tos;
> -	u8			ttl;
> -	__be32			label;
> -	u32			flags;
> -	unsigned long		age_interval;
> -	unsigned int		addrmax;
> -	bool			no_share;
> -	enum ifla_vxlan_df	df;
> +	union vxlan_addr		remote_ip;
> +	union vxlan_addr		saddr;
> +	__be32				vni;
> +	int				remote_ifindex;
> +	int				mtu;
> +	__be16				dst_port;
> +	u16				port_min;
> +	u16				port_max;
> +	u8				tos;
> +	u8				ttl;
> +	__be32				label;
> +	enum ifla_vxlan_label_behavior	label_behavior;
> +	u32				flags;
> +	unsigned long			age_interval;
> +	unsigned int			addrmax;
> +	bool				no_share;
> +	enum ifla_vxlan_df		df;
>  };
>  
>  enum {
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index fac351a93aed..13afc4afcc76 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -830,6 +830,7 @@ enum {
>  	IFLA_VXLAN_DF,
>  	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
>  	IFLA_VXLAN_LOCALBYPASS,
> +	IFLA_VXLAN_LABEL_BEHAVIOR,
>  	__IFLA_VXLAN_MAX
>  };
>  #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
> @@ -847,6 +848,13 @@ enum ifla_vxlan_df {
>  	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
>  };
>  
> +enum ifla_vxlan_label_behavior {
> +	VXLAN_LABEL_FIXED = 0,
> +	VXLAN_LABEL_INHERIT = 1,
> +	__VXLAN_LABEL_END,
> +	VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1,
> +};
> +
>  /* GENEVE section */
>  enum {
>  	IFLA_GENEVE_UNSPEC,
> -- 
> 2.39.2
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-10-11  7:11 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-09-30 14:28 [PATCH net-next] vxlan: add support for flowlabel inherit Alce Lafranque
2023-09-30 15:29 ` Eric Dumazet
2023-09-30 18:16   ` alce
2023-10-01 21:21   ` Vincent Bernat
2023-10-03 10:59 ` Ido Schimmel
2023-10-07 14:46   ` alce
2023-10-07 14:26 ` [PATCH net-next v2] " Alce Lafranque
2023-10-07 15:44   ` kernel test robot
2023-10-11  7:11   ` Ido Schimmel
2023-10-07 17:09 ` [PATCH net-next] " Tom Herbert

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).