Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH iproute2 V3 2/3] tc/cls_flower: Classify packet in ip tunnels
From: Amir Vadai @ 2016-11-30  7:38 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Jiri Pirko,
	Jiri Benc, Amir Vadai
In-Reply-To: <20161130073840.5269-1-amir@vadai.me>

Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of shared
vxlan device named 'vxlan0'. To forward packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0':

$ tc filter add dev vxlan0 protocol ip parent ffff: \
    flower \
      enc_src_ip 11.11.0.2 \
      enc_dst_ip 11.11.0.1 \
      enc_key_id 11 \
      dst_ip 11.11.11.1 \
    action mirred egress redirect dev vnet0

Signed-off-by: Amir Vadai <amir@vadai.me>
---
 man/man8/tc-flower.8 | 17 ++++++++++-
 tc/f_flower.c        | 84 +++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 74f76647753b..0e0b0cf4bb72 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -36,7 +36,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -121,6 +125,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 2d31d1aa832d..173cfc20f90b 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
 	fprintf(stderr, "                       dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n");
 	fprintf(stderr, "                       src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n");
 	fprintf(stderr, "                       dst_port PORT-NUMBER |\n");
-	fprintf(stderr, "                       src_port PORT-NUMBER }\n");
+	fprintf(stderr, "                       src_port PORT-NUMBER |\n");
+	fprintf(stderr, "                       enc_dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n");
+	fprintf(stderr, "                       enc_src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n");
+	fprintf(stderr, "                       enc_key_id [ KEY-ID ] }\n");
 	fprintf(stderr,	"       FILTERID := X:Y:Z\n");
 	fprintf(stderr,	"       ACTION-SPEC := ... look at individual actions\n");
 	fprintf(stderr,	"\n");
@@ -121,8 +124,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
 		family = AF_INET;
 	} else if (eth_type == htons(ETH_P_IPV6)) {
 		family = AF_INET6;
+	} else if (!eth_type) {
+		family = AF_UNSPEC;
 	} else {
-		fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
 		return -1;
 	}
 
@@ -130,8 +134,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
 	if (ret)
 		return -1;
 
-	if (addr.family != family)
+	if (family && (addr.family != family)) {
+		fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
 		return -1;
+	}
 
 	addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
 		  addr.data, addr.bytelen);
@@ -181,6 +187,18 @@ static int flower_parse_port(char *str, __u8 ip_port,
 	return 0;
 }
 
+static int flower_parse_key_id(const char *str, int type, struct nlmsghdr *n)
+{
+	int ret;
+	__be32 key_id;
+
+	ret = get_be32(&key_id, str, 10);
+	if (!ret)
+		addattr32(n, MAX_MSG, type, key_id);
+
+	return ret;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
 			    int argc, char **argv, struct nlmsghdr *n)
 {
@@ -339,6 +357,38 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
 				fprintf(stderr, "Illegal \"src_port\"\n");
 				return -1;
 			}
+		} else if (matches(*argv, "enc_dst_ip") == 0) {
+			NEXT_ARG();
+			ret = flower_parse_ip_addr(*argv, 0,
+						   TCA_FLOWER_KEY_ENC_IPV4_DST,
+						   TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+						   TCA_FLOWER_KEY_ENC_IPV6_DST,
+						   TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+						   n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "enc_src_ip") == 0) {
+			NEXT_ARG();
+			ret = flower_parse_ip_addr(*argv, 0,
+						   TCA_FLOWER_KEY_ENC_IPV4_SRC,
+						   TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+						   TCA_FLOWER_KEY_ENC_IPV6_SRC,
+						   TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+						   n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"enc_src_ip\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "enc_key_id") == 0) {
+			NEXT_ARG();
+			ret = flower_parse_key_id(*argv,
+						  TCA_FLOWER_KEY_ENC_KEY_ID, n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"enc_key_id\"\n");
+				return -1;
+			}
 		} else if (matches(*argv, "action") == 0) {
 			NEXT_ARG();
 			ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n);
@@ -506,7 +556,14 @@ static void flower_print_port(FILE *f, char *name, __u8 ip_proto,
 		return;
 	if (!attr)
 		return;
-	fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
+	fprintf(f, "\n  %s %d", name, rta_getattr_be16(attr));
+}
+
+static void flower_print_key_id(FILE *f, const char *name,
+				struct rtattr *attr)
+{
+	if (attr)
+		fprintf(f, "\n  %s %d", name, rta_getattr_be32(attr));
 }
 
 static int flower_print_opt(struct filter_util *qu, FILE *f,
@@ -577,6 +634,25 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
 			  tb[TCA_FLOWER_KEY_TCP_SRC],
 			  tb[TCA_FLOWER_KEY_UDP_SRC]);
 
+	flower_print_ip_addr(f, "enc_dst_ip",
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] ?
+			     htons(ETH_P_IP) : htons(ETH_P_IPV6),
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_DST],
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_DST_MASK],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_DST],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_DST_MASK]);
+
+	flower_print_ip_addr(f, "enc_src_ip",
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] ?
+			     htons(ETH_P_IP) : htons(ETH_P_IPV6),
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_SRC],
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_SRC],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK]);
+
+	flower_print_key_id(f, "enc_key_id",
+			    tb[TCA_FLOWER_KEY_ENC_KEY_ID]);
+
 	if (tb[TCA_FLOWER_FLAGS])  {
 		__u32 flags = rta_getattr_u32(tb[TCA_FLOWER_FLAGS]);
 
-- 
2.10.2

^ permalink raw reply related

* [PATCH iproute2 V3 1/3] libnetlink: Introduce rta_getattr_be*()
From: Amir Vadai @ 2016-11-30  7:38 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Jiri Pirko,
	Jiri Benc, Amir Vadai
In-Reply-To: <20161130073840.5269-1-amir@vadai.me>

Add the utility functions rta_getattr_be16() and rta_getattr_be32(), and
change existing code to use it.

Signed-off-by: Amir Vadai <amir@vadai.me>
---
 bridge/fdb.c         | 4 ++--
 include/libnetlink.h | 9 +++++++++
 ip/iplink_geneve.c   | 2 +-
 ip/iplink_vxlan.c    | 2 +-
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index 90f4b154c5dc..a91521776e99 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -168,10 +168,10 @@ int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 	if (tb[NDA_PORT]) {
 		if (jw_global)
 			jsonw_uint_field(jw_global, "port",
-					 ntohs(rta_getattr_u16(tb[NDA_PORT])));
+					 rta_getattr_be16(tb[NDA_PORT]));
 		else
 			fprintf(fp, "port %d ",
-				ntohs(rta_getattr_u16(tb[NDA_PORT])));
+				rta_getattr_be16(tb[NDA_PORT]));
 	}
 
 	if (tb[NDA_VNI]) {
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 483509ca9635..751ebf186dd4 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -10,6 +10,7 @@
 #include <linux/if_addr.h>
 #include <linux/neighbour.h>
 #include <linux/netconf.h>
+#include <arpa/inet.h>
 
 struct rtnl_handle {
 	int			fd;
@@ -140,10 +141,18 @@ static inline __u16 rta_getattr_u16(const struct rtattr *rta)
 {
 	return *(__u16 *)RTA_DATA(rta);
 }
+static inline __be16 rta_getattr_be16(const struct rtattr *rta)
+{
+	return ntohs(rta_getattr_u16(rta));
+}
 static inline __u32 rta_getattr_u32(const struct rtattr *rta)
 {
 	return *(__u32 *)RTA_DATA(rta);
 }
+static inline __be32 rta_getattr_be32(const struct rtattr *rta)
+{
+	return ntohl(rta_getattr_u32(rta));
+}
 static inline __u64 rta_getattr_u64(const struct rtattr *rta)
 {
 	__u64 tmp;
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 3bfba91c644c..1e6669d07d60 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -234,7 +234,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 
 	if (tb[IFLA_GENEVE_PORT])
 		fprintf(f, "dstport %u ",
-			ntohs(rta_getattr_u16(tb[IFLA_GENEVE_PORT])));
+			rta_getattr_be16(tb[IFLA_GENEVE_PORT]));
 
 	if (tb[IFLA_GENEVE_COLLECT_METADATA])
 		fputs("external ", f);
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 93af979a1e97..6d02bb47b2f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -413,7 +413,7 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 
 	if (tb[IFLA_VXLAN_PORT])
 		fprintf(f, "dstport %u ",
-			ntohs(rta_getattr_u16(tb[IFLA_VXLAN_PORT])));
+			rta_getattr_be16(tb[IFLA_VXLAN_PORT]));
 
 	if (tb[IFLA_VXLAN_LEARNING] &&
 	    !rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
-- 
2.10.2

^ permalink raw reply related

* [PATCH iproute2 V3 0/3] tc: Support for ip tunnel metadata set/unset/classify
From: Amir Vadai @ 2016-11-30  7:38 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Jiri Pirko,
	Jiri Benc, Amir Vadai

Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit f3f339e9590a ("cleanup debris from revert")

Example usage:

$ tc filter add dev vxlan0 protocol ip parent ffff: \
    flower \
      enc_src_ip 11.11.0.2 \
      enc_dst_ip 11.11.0.1 \
      enc_key_id 11 \
      dst_ip 11.11.11.1 \
    action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent ffff: \
    flower \
      ip_proto 1 \
      dst_ip 11.11.11.2 \
    action tunnel_key set \
      src_ip 11.11.0.1 \
      dst_ip 11.11.0.2 \
      id 11 \
    action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Changes from V2:
- Use const where needed
- Don't lose return value
- Introduce rta_getattr_be16() and rta_getattr_be32()

Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
	and the man page tc-tunnel_key to reflect the fact that 'unset' operation is
	no mandatory.
	And describe when it might be needed.
- Rename the 'release' operation to 'unset'

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 bridge/fdb.c                         |   4 +-
 include/libnetlink.h                 |   9 ++
 include/linux/tc_act/tc_tunnel_key.h |  42 ++++++
 ip/iplink_geneve.c                   |   2 +-
 ip/iplink_vxlan.c                    |   2 +-
 man/man8/tc-flower.8                 |  17 ++-
 man/man8/tc-tunnel_key.8             | 113 +++++++++++++++
 tc/Makefile                          |   1 +
 tc/f_flower.c                        |  84 +++++++++++-
 tc/m_tunnel_key.c                    | 258 +++++++++++++++++++++++++++++++++++
 10 files changed, 523 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2

^ permalink raw reply

* Re: [PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels
From: Amir Vadai @ 2016-11-30  7:42 UTC (permalink / raw)
  To: Stephen Hemminger, Jiri Pirko
  Cc: Linux Netdev List, David S. Miller, Jiri Benc, Or Gerlitz,
	Hadar Har-Zion, Roi Dayan
In-Reply-To: <20161130071753.GA1366@office.localdomain>

On Wed, Nov 30, 2016 at 9:17 AM, Amir Vadai <amir@vadai.me> wrote:
> On Tue, Nov 29, 2016 at 07:26:58PM -0800, Stephen Hemminger wrote:
(Sending again since I just discovered that Google Inbox is adding an
HTML part...)

>> The overall design is fine, just a couple nits with the code.
>>
>> > diff --git a/tc/f_flower.c b/tc/f_flower.c
>> > index 2d31d1aa832d..1cf0750b5b83 100644
>> > --- a/tc/f_flower.c
>> > +++ b/tc/f_flower.c
>>
>> >
>> > +static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
>>
>> str is not modified, therefore use: const char *str
> ack
>
>>
>> > +{
>> > +   int ret;
>> > +   __be32 key_id;
>> > +
>> > +   ret = get_be32(&key_id, str, 10);
>> > +   if (ret)
>> > +           return -1;
>>
>> Traditionally netlink attributes are in host order, why was flower
>> chosen to be different?
> I don't know, maybe Jiri (cc'ed) can explain, but it is all over the
> flower code.
Now the right Jiri (Pirko) is CC'ed

>
>>
>> > +
>> > +   addattr32(n, MAX_MSG, type, key_id);
>> > +
>> > +   return 0;
>>
>>
>> Why lose the return value here?  Why not:
>>
>>       ret = get_be32(&key_id, str, 10);
>>       if (!ret)
>>               addattr32(n, MAX_MSG, type, key_id);
> The get_*() function can return only -1 or 0. But you are right, and it
> is better the way you suggested.  Changing accordingly in V3.
>
>>
>> > +}
>> > +
>> >  static int flower_parse_opt(struct filter_util *qu, char *handle,
>> >                         int argc, char **argv, struct nlmsghdr *n)
>> >  {
>> > @@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
>> >                             fprintf(stderr, "Illegal \"src_port\"\n");
>> >                             return -1;
>> >                     }
>> > +           } else if (matches(*argv, "enc_dst_ip") == 0) {
>> > +                   NEXT_ARG();
>> > +                   ret = flower_parse_ip_addr(*argv, 0,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV4_DST,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV6_DST,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
>> > +                                              n);
>> > +                   if (ret < 0) {
>> > +                           fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
>> > +                           return -1;
>> > +                   }
>> > +           } else if (matches(*argv, "enc_src_ip") == 0) {
>> > +                   NEXT_ARG();
>> > +                   ret = flower_parse_ip_addr(*argv, 0,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV4_SRC,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
>> > +                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
>> > +                                              n);
>> > +                   if (ret < 0) {
>> > +                           fprintf(stderr, "Illegal \"enc_src_ip\"\n");
>> > +                           return -1;
>> > +                   }
>> > +           } else if (matches(*argv, "enc_key_id") == 0) {
>> > +                   NEXT_ARG();
>> > +                   ret = flower_parse_key_id(*argv,
>> > +                                             TCA_FLOWER_KEY_ENC_KEY_ID, n);
>> > +                   if (ret < 0) {
>> > +                           fprintf(stderr, "Illegal \"enc_key_id\"\n");
>> > +                           return -1;
>> > +                   }
>> >             } else if (matches(*argv, "action") == 0) {
>> >                     NEXT_ARG();
>> >                     ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n);
>> > @@ -509,6 +561,14 @@ static void flower_print_port(FILE *f, char *name, __u8 ip_proto,
>> >     fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
>> >  }
>> >
>> > +static void flower_print_key_id(FILE *f, char *name,
>> > +                           struct rtattr *attr)
>>
>> const char *name?
> ack
>
>>
>>
>> > +{
>> > +   if (!attr)
>> > +           return;
>> > +   fprintf(f, "\n  %s %d", name, ntohl(rta_getattr_u32(attr)));
>> > +}
>> > +
>>
>> Why short circuit, just change the order:
>>
>>       if (attr)
>>               fprintf(f, "\n  %s %s", name, ntohl(rta_getattr_u32(attr));
>>
>> You might also want to introduce rta_getattr_be32()
> ack
>
>>
>> Please change, retest and resubmit both patches.
> ack
>
> Thanks for reviewing,
> Amir

^ permalink raw reply

* RE: [patch net / RFC] net: fec: increase frame size limitation to actually available buffer
From: Andy Duan @ 2016-11-30  7:35 UTC (permalink / raw)
  To: Nikita Yushchenko, David S. Miller, Troy Kisky, Andrew Lunn,
	Eric Nelson, Philippe Reynes, Johannes Berg,
	netdev@vger.kernel.org
  Cc: Chris Healy, Fabio Estevam, linux-kernel@vger.kernel.org
In-Reply-To: <dab27a3d-821c-8d07-ad98-2bdd8c442bc5@cogentembedded.com>

From: Nikita Yushchenko <nikita.yoush@cogentembedded.com> Sent: Wednesday, November 30, 2016 2:36 PM
 >To: Andy Duan <fugang.duan@nxp.com>; David S. Miller
 ><davem@davemloft.net>; Troy Kisky <troy.kisky@boundarydevices.com>;
 >Andrew Lunn <andrew@lunn.ch>; Eric Nelson <eric@nelint.com>; Philippe
 >Reynes <tremyfr@gmail.com>; Johannes Berg <johannes@sipsolutions.net>;
 >netdev@vger.kernel.org
 >Cc: Chris Healy <cphealy@gmail.com>; Fabio Estevam
 ><fabio.estevam@nxp.com>; linux-kernel@vger.kernel.org
 >Subject: Re: [patch net / RFC] net: fec: increase frame size limitation to
 >actually available buffer
 >
 >> But I think it is not necessary since the driver don't support jumbo frame.
 >
 >Hardcoded 1522 raises two separate issues.
 >
 >(1) When DSA is in use, frames processed by FEC chip contain DSA tag and
 >thus can be larger than hardcoded limit of 1522. This issue is not FEC-specific,
 >any driver that hardcodes maximum frame size to 1522 (many
 >do) will have this issue if used with DSA.
 >
 >Clean solution for this must take into account that difference between MTU
 >and max frame size is no longer known at compile time. Actually this is the
 >case even without DSA, due to VLANs: max frame size is (MTU + 18) without
 >VLANs, but (MTU + 22) with VLANs. However currently drivers tend to ignore
 >this and hardcode 22.  With DSA, 22 is not enough, need to add switch-
 >specific tag size to that.
 >
 >Not yet sure how to handle this. DSA-specific API to find out tag size could be
 >added, but generic solution should handle all cases of dynamic difference
 >between MTU and max frame size, not only DSA.
 >
 >
 >(2) There is some demand to use larger frames for optimization purposes.
 >
 >FEC register fields that limit frame size are 14-bit, thus allowing frames up to
 >(4k-1). I'm about to prepare a larger patch:
 >- add ndo_change_mtu handler, allowing MTU up to (4k - overhead),
 >- set MAX_FL / TRUNC_FL based on configured MTU,
 >- if necessary, do buffer reallocation with larger buffers.
 >
 >Is this suitable for upstreaming?
 >Is there any policy related to handling larger frames?

Of course, welcome to upstream the jumbo frame patches, but hope to also add the transmit jumbo frame, not only receive path, which is helpful for testing with two boards connection.
And, some notes need you to care:
- the maximum jumbo frame should consider the fifo size. Different chip has different fifo size. Like i.MX53 tx and rx share one fifo, i.mx6q/dl/sl have separate 4k fifo for tx and rx, i.mx6sx/i.mx7x have separate 8k fifo for tx and rx.
- rx fifo watermark to generate pause frame in busy loading system to avoid fifo overrun. In general,  little pause frames bring better performance, mass of pause frames cause worse performance.


Regards,
Andy

^ permalink raw reply

* Re: [PATCH v3 net-next 3/3] openvswitch: Fix skb->protocol for vlan frames.
From: Pravin Shelar @ 2016-11-30  7:34 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: Linux Kernel Network Developers, Jiri Benc, Eric Garver
In-Reply-To: <1480462253-114713-3-git-send-email-jarno@ovn.org>

On Tue, Nov 29, 2016 at 3:30 PM, Jarno Rajahalme <jarno@ovn.org> wrote:
> Do not always set skb->protocol to be the ethertype of the L3 header.
> For a packet with non-accelerated VLAN tags skb->protocol needs to be
> the ethertype of the outermost non-accelerated VLAN ethertype.
>
> Any VLAN offloading is undone on the OVS netlink interface, and any
> VLAN tags added by userspace are non-accelerated, as are double tagged
> VLAN packets.
>
> Fixes: 018c1dda5f ("openvswitch: 802.1AD Flow handling, actions, vlan parsing, netlink attributes")
> Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets")
> Signed-off-by: Jarno Rajahalme <jarno@ovn.org>

Looks much better now. Thanks for fixing it. I have one minor comment.

Acked-by: Pravin B Shelar <pshelar@ovn.org>

> ---
> v3: Set skb->protocol properly also for double tagged packets, as suggested
>     by Pravin.  This patch is no longer needed for the MTU test failure, as
>     the new patch 2/3 addresses that.
>
>  net/openvswitch/datapath.c |  1 -
>  net/openvswitch/flow.c     | 62 +++++++++++++++++++++++-----------------------
>  2 files changed, 31 insertions(+), 32 deletions(-)
>
> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> index 08aa926..e2fe2e5 100644
> --- a/net/openvswitch/flow.c
> +++ b/net/openvswitch/flow.c
> @@ -354,6 +354,7 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
>                 res = parse_vlan_tag(skb, &key->eth.vlan);
>                 if (res <= 0)
>                         return res;
> +               skb->protocol = key->eth.vlan.tpid;
>         }
>
>         /* Parse inner vlan tag. */
> @@ -361,6 +362,11 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
>         if (res <= 0)
>                 return res;
>
> +       /* If the outer vlan tag was accelerated, skb->protocol should
> +        * refelect the inner vlan type. */
> +       if (!eth_type_vlan(skb->protocol))

Since you would be spinning another version, can you change this
condition to directly check for skb-vlan-tag rather than indirectly
checking for the vlan accelerated case?

^ permalink raw reply

* Re: [WIP] net+mlx4: auto doorbell
From: Alexei Starovoitov @ 2016-11-30  7:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jesper Dangaard Brouer, Rick Jones, netdev@vger.kernel.org,
	Saeed Mahameed, Tariq Toukan

On Mon, Nov 28, 2016 at 10:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>  {
> @@ -496,8 +531,13 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
>         wmb();
>
>         /* we want to dirty this cache line once */
> -       ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
> -       ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
> +       WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
> +       ring_cons += txbbs_skipped;
> +       WRITE_ONCE(ring->cons, ring_cons);
> +       WRITE_ONCE(ring->ncons, ring_cons + last_nr_txbb);
> +
> +       if (dev->doorbell_opt)
> +               mlx4_en_xmit_doorbell(dev, ring);
...
> +       /* Doorbell avoidance : We can omit doorbell if we know a TX completion
> +        * will happen shortly.
> +        */
> +       if (send_doorbell &&
> +           dev->doorbell_opt &&
> +           (s32)(READ_ONCE(ring->prod_bell) - READ_ONCE(ring->ncons)) > 0)
> +               send_doorbell = false;

this is awesome idea!
I'm missing though why you need all these read_once/write_once.
It's a tx ring that is already protected by tx queue lock
or completion is happening without grabbing the lock?
Then how do we make sure that we don't race here
and indeed above prod_bell - ncons > 0 condition is safe?
Good description of algorithm would certainly help ;)

^ permalink raw reply

* Re: [patch net / RFC] net: fec: increase frame size limitation to actually available buffer
From: Toshiaki Makita @ 2016-11-30  7:25 UTC (permalink / raw)
  To: Nikita Yushchenko, Andy Duan, David S. Miller, Troy Kisky,
	Andrew Lunn, Eric Nelson, Philippe Reynes, Johannes Berg,
	netdev@vger.kernel.org
  Cc: Chris Healy, Fabio Estevam, linux-kernel@vger.kernel.org
In-Reply-To: <dab27a3d-821c-8d07-ad98-2bdd8c442bc5@cogentembedded.com>

On 2016/11/30 15:36, Nikita Yushchenko wrote:
>> But I think it is not necessary since the driver don't support jumbo frame.
> 
> Hardcoded 1522 raises two separate issues.
> 
> (1) When DSA is in use, frames processed by FEC chip contain DSA tag and
> thus can be larger than hardcoded limit of 1522. This issue is not
> FEC-specific, any driver that hardcodes maximum frame size to 1522 (many
> do) will have this issue if used with DSA.
> 
> Clean solution for this must take into account that difference between
> MTU and max frame size is no longer known at compile time. Actually this
> is the case even without DSA, due to VLANs: max frame size is (MTU + 18)
> without VLANs, but (MTU + 22) with VLANs. However currently drivers tend
> to ignore this and hardcode 22.  With DSA, 22 is not enough, need to add
> switch-specific tag size to that.
> 
> Not yet sure how to handle this. DSA-specific API to find out tag size
> could be added, but generic solution should handle all cases of dynamic
> difference between MTU and max frame size, not only DSA.

BTW I'm trying to introduce envelope frames to solve this kind of problems.
http://marc.info/?t=147496691500005&r=1&w=2
http://marc.info/?t=147496691500003&r=1&w=2
http://marc.info/?t=147496691500002&r=1&w=2
http://marc.info/?t=147496691500004&r=1&w=2
http://marc.info/?t=147496691500001&r=1&w=2

It needs jumbo frame support of NICs though.

Regards,
Toshiaki Makita

^ permalink raw reply

* Re: [PATCH v3 net-next 2/3] openvswitch: Use is_skb_forwardable() for length check.
From: Pravin Shelar @ 2016-11-30  7:23 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: Linux Kernel Network Developers, Jiri Benc, Eric Garver
In-Reply-To: <1480462253-114713-2-git-send-email-jarno@ovn.org>

On Tue, Nov 29, 2016 at 3:30 PM, Jarno Rajahalme <jarno@ovn.org> wrote:
> Use is_skb_forwardable() instead of an explicit length check.  This
> gets around the apparent MTU check failure in OVS test cases when
> skb->protocol is not properly set in case of non-accelerated VLAN
> skbs.
>
> Suggested-by: Pravin Shelar <pshelar@ovn.org>
> Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets")
> Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
> ---
> v3: New patch suggested by Pravin.
>
> net/openvswitch/vport.c | 38 ++++++++++++++------------------------
>  1 file changed, 14 insertions(+), 24 deletions(-)
>
> diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
> index b6c8524..076b39f 100644
> --- a/net/openvswitch/vport.c
> +++ b/net/openvswitch/vport.c

>  void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
>  {
> -       int mtu = vport->dev->mtu;
> -
>         switch (vport->dev->type) {
>         case ARPHRD_NONE:
>                 if (mac_proto == MAC_PROTO_ETHERNET) {
> @@ -504,11 +485,20 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
>                 goto drop;
>         }
>
> -       if (unlikely(packet_length(skb, vport->dev) > mtu &&
> -                    !skb_is_gso(skb))) {
> -               net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
> -                                    vport->dev->name,
> -                                    packet_length(skb, vport->dev), mtu);
> +       if (unlikely(!is_skb_forwardable(vport->dev, skb))) {

This would easy to read if you inverse the if condition here.

> +               /* Log only if the device is up. */
> +               if (vport->dev->flags & IFF_UP) {

since is_skb_forwardable() is checking for IFF_UP we can remove same
check from internal-device send() op.

> +                       unsigned int length = skb->len
> +                               - vport->dev->hard_header_len;
> +
> +                       if (!skb_vlan_tag_present(skb)
> +                           && eth_type_vlan(skb->protocol))
> +                               length -= VLAN_HLEN;
> +
> +                       net_warn_ratelimited("%s: dropped over-mtu packet %d > %d\n",
> +                                            vport->dev->name, length,
> +                                            vport->dev->mtu);
> +               }
>                 vport->dev->stats.tx_errors++;
>                 goto drop;
>         }
> --
> 2.1.4
>

^ permalink raw reply

* Re: [PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels
From: Amir Vadai @ 2016-11-30  7:17 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Jiri Benc, Or Gerlitz, Hadar Har-Zion,
	Roi Dayan
In-Reply-To: <20161129192658.5a4c3e2c@samsung9.wavecable.com>

On Tue, Nov 29, 2016 at 07:26:58PM -0800, Stephen Hemminger wrote:
> The overall design is fine, just a couple nits with the code.
> 
> > diff --git a/tc/f_flower.c b/tc/f_flower.c
> > index 2d31d1aa832d..1cf0750b5b83 100644
> > --- a/tc/f_flower.c
> > +++ b/tc/f_flower.c
> 
> >  
> > +static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
> 
> str is not modified, therefore use: const char *str
ack

> 
> > +{
> > +	int ret;
> > +	__be32 key_id;
> > +
> > +	ret = get_be32(&key_id, str, 10);
> > +	if (ret)
> > +		return -1;
> 
> Traditionally netlink attributes are in host order, why was flower
> chosen to be different?
I don't know, maybe Jiri (cc'ed) can explain, but it is all over the
flower code.

> 
> > +
> > +	addattr32(n, MAX_MSG, type, key_id);
> > +
> > +	return 0;
> 
> 
> Why lose the return value here?  Why not:
> 
> 	ret = get_be32(&key_id, str, 10);
> 	if (!ret)
> 		addattr32(n, MAX_MSG, type, key_id);
The get_*() function can return only -1 or 0. But you are right, and it
is better the way you suggested.  Changing accordingly in V3.

> 
> > +}
> > +
> >  static int flower_parse_opt(struct filter_util *qu, char *handle,
> >  			    int argc, char **argv, struct nlmsghdr *n)
> >  {
> > @@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
> >  				fprintf(stderr, "Illegal \"src_port\"\n");
> >  				return -1;
> >  			}
> > +		} else if (matches(*argv, "enc_dst_ip") == 0) {
> > +			NEXT_ARG();
> > +			ret = flower_parse_ip_addr(*argv, 0,
> > +						   TCA_FLOWER_KEY_ENC_IPV4_DST,
> > +						   TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
> > +						   TCA_FLOWER_KEY_ENC_IPV6_DST,
> > +						   TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
> > +						   n);
> > +			if (ret < 0) {
> > +				fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
> > +				return -1;
> > +			}
> > +		} else if (matches(*argv, "enc_src_ip") == 0) {
> > +			NEXT_ARG();
> > +			ret = flower_parse_ip_addr(*argv, 0,
> > +						   TCA_FLOWER_KEY_ENC_IPV4_SRC,
> > +						   TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
> > +						   TCA_FLOWER_KEY_ENC_IPV6_SRC,
> > +						   TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
> > +						   n);
> > +			if (ret < 0) {
> > +				fprintf(stderr, "Illegal \"enc_src_ip\"\n");
> > +				return -1;
> > +			}
> > +		} else if (matches(*argv, "enc_key_id") == 0) {
> > +			NEXT_ARG();
> > +			ret = flower_parse_key_id(*argv,
> > +						  TCA_FLOWER_KEY_ENC_KEY_ID, n);
> > +			if (ret < 0) {
> > +				fprintf(stderr, "Illegal \"enc_key_id\"\n");
> > +				return -1;
> > +			}
> >  		} else if (matches(*argv, "action") == 0) {
> >  			NEXT_ARG();
> >  			ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n);
> > @@ -509,6 +561,14 @@ static void flower_print_port(FILE *f, char *name, __u8 ip_proto,
> >  	fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
> >  }
> >  
> > +static void flower_print_key_id(FILE *f, char *name,
> > +				struct rtattr *attr)
> 
> const char *name?
ack

> 
> 
> > +{
> > +	if (!attr)
> > +		return;
> > +	fprintf(f, "\n  %s %d", name, ntohl(rta_getattr_u32(attr)));
> > +}
> > +
> 
> Why short circuit, just change the order:
> 
> 	if (attr)
> 		fprintf(f, "\n  %s %s", name, ntohl(rta_getattr_u32(attr));
> 
> You might also want to introduce rta_getattr_be32()
ack

> 
> Please change, retest and resubmit both patches.
ack

Thanks for reviewing,
Amir

^ permalink raw reply

* Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure
From: Alexei Starovoitov @ 2016-11-30  7:01 UTC (permalink / raw)
  To: Thomas Graf; +Cc: davem, netdev, daniel, tom, roopa, hannes
In-Reply-To: <20161130064850.GB16856@pox.localdomain>

On Wed, Nov 30, 2016 at 07:48:51AM +0100, Thomas Graf wrote:
> On 11/29/16 at 04:15pm, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
> > ...
> > > +#define LWT_BPF_MAX_HEADROOM 128
> > 
> > why 128?
> > btw I'm thinking for XDP to use 256, so metadata can be stored in there.
> 
> It's an arbitrary limit to catch obvious misconfiguration. I'm absolutely
> fine with bumping it to 256.
> 
> > > +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
> > > +		       struct dst_entry *dst, bool can_redirect)
> > > +{
> > > +	int ret;
> > > +
> > > +	/* Preempt disable is needed to protect per-cpu redirect_info between
> > > +	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
> > > +	 * access to maps strictly require a rcu_read_lock() for protection,
> > > +	 * mixing with BH RCU lock doesn't work.
> > > +	 */
> > > +	preempt_disable();
> > > +	rcu_read_lock();
> > > +	bpf_compute_data_end(skb);
> > > +	ret = BPF_PROG_RUN(lwt->prog, skb);
> > > +	rcu_read_unlock();
> > > +
> > > +	switch (ret) {
> > > +	case BPF_OK:
> > > +		break;
> > > +
> > > +	case BPF_REDIRECT:
> > > +		if (!can_redirect) {
> > > +			WARN_ONCE(1, "Illegal redirect return code in prog %s\n",
> > > +				  lwt->name ? : "<unknown>");
> > > +			ret = BPF_OK;
> > > +		} else {
> > > +			ret = skb_do_redirect(skb);
> > 
> > I think this assumes that program did bpf_skb_push and L2 header is present.
> > Would it make sense to check that mac_header < network_header here to make
> > sure that it actually happened? I think the cost of single 'if' isn't much.
> > Also skb_do_redirect() can redirect to l3 tunnels like ipip ;)
> > so program shouldn't be doing bpf_skb_push in such case...
> 
> We are currently guaranteeing mac_header <= network_header given that
> bpf_skb_push() is calling skb_reset_mac_header() unconditionally.
> 
> Even if a program were to push an L2 header and then redirect to an l3
> tunnel, __bpf_redirect_no_mac will pull it off again and correct the
> mac_header offset.

yes. that part is fine.

> Should we check in __bpf_redirect_common() whether mac_header <
> nework_header then or add it to lwt-bpf conditional on
> dev_is_mac_header_xmit()?

may be only extra 'if' in lwt-bpf is all we need?
I'm still missing what will happen if we 'forget' to do
bpf_skb_push() inside the lwt-bpf program, but still do redirect
in lwt_xmit stage to l2 netdev...

^ permalink raw reply

* Re: [PATCH net-next v3 4/4] bpf: Add tests and samples for LWT-BPF
From: Thomas Graf @ 2016-11-30  6:52 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: davem, netdev, daniel, tom, roopa, hannes
In-Reply-To: <20161130001750.GB28238@ast-mbp.thefacebook.com>

On 11/29/16 at 04:17pm, Alexei Starovoitov wrote:
> On Tue, Nov 29, 2016 at 02:21:23PM +0100, Thomas Graf wrote:
> > Adds a series of test to verify the functionality of attaching
> > BPF programs at LWT hooks.
> > 
> > Also adds a sample which collects a histogram of packet sizes which
> > pass through an LWT hook.
> > 
> > $ ./lwt_len_hist.sh
> > Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
> > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.253.2 () port 0 AF_INET : demo
> > Recv   Send    Send
> > Socket Socket  Message  Elapsed
> > Size   Size    Size     Time     Throughput
> > bytes  bytes   bytes    secs.    10^6bits/sec
> > 
> >  87380  16384  16384    10.00    39857.69
> 
> Nice!
> 
> > +	ret = bpf_redirect(ifindex, 0);
> > +	if (ret < 0) {
> > +		printk("bpf_redirect() failed: %d\n", ret);
> > +		return BPF_DROP;
> > +	}
> 
> this 'if' looks a bit weird. You're passing 0 as flags,
> so this helper will always succeed.
> Other sample code often does 'return bpf_redirect(...)'
> due to this reasoning.

Right, the if branch is absolutely useless. I will remove it.

^ permalink raw reply

* Re: [PATCH] netns: avoid disabling irq for netns id
From: Cong Wang @ 2016-11-30  6:51 UTC (permalink / raw)
  To: Paul Moore; +Cc: Linux Kernel Network Developers, linux-audit
In-Reply-To: <CAGH-KgsK=Dc=Ptfg8P39pemFAsxVr9NKccev7sOW=w1suqKk+w@mail.gmail.com>

On Tue, Nov 29, 2016 at 2:14 PM, Paul Moore <pmoore@redhat.com> wrote:
> On Tue, Nov 29, 2016 at 5:11 PM, Paul Moore <pmoore@redhat.com> wrote:
>> From: Paul Moore <paul@paul-moore.com>
>>
>> Bring back commit bc51dddf98c9 ("netns: avoid disabling irq for netns
>> id") now that we've fixed some audit multicast issues that caused
>> problems with original attempt.  Additional information, and history,
>> can be found in the links below:
>>
>>  * https://github.com/linux-audit/audit-kernel/issues/22
>>  * https://github.com/linux-audit/audit-kernel/issues/23
>>
>> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
>> Signed-off-by: Paul Moore <paul@paul-moore.com>
>> ---
>>  net/core/net_namespace.c |   35 +++++++++++++++--------------------
>>  1 file changed, 15 insertions(+), 20 deletions(-)
>
> Cong Wang, I added your sign-off to the patch since you were the
> original author, if you would prefer I leave it off or want it changed
> just let me know.

Thanks for not forgetting it. I am fine with signed-off-by.

^ permalink raw reply

* Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure
From: Thomas Graf @ 2016-11-30  6:48 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: davem, netdev, daniel, tom, roopa, hannes
In-Reply-To: <20161130001504.GA28238@ast-mbp.thefacebook.com>

On 11/29/16 at 04:15pm, Alexei Starovoitov wrote:
> On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
> ...
> > +#define LWT_BPF_MAX_HEADROOM 128
> 
> why 128?
> btw I'm thinking for XDP to use 256, so metadata can be stored in there.

It's an arbitrary limit to catch obvious misconfiguration. I'm absolutely
fine with bumping it to 256.

> > +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
> > +		       struct dst_entry *dst, bool can_redirect)
> > +{
> > +	int ret;
> > +
> > +	/* Preempt disable is needed to protect per-cpu redirect_info between
> > +	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
> > +	 * access to maps strictly require a rcu_read_lock() for protection,
> > +	 * mixing with BH RCU lock doesn't work.
> > +	 */
> > +	preempt_disable();
> > +	rcu_read_lock();
> > +	bpf_compute_data_end(skb);
> > +	ret = BPF_PROG_RUN(lwt->prog, skb);
> > +	rcu_read_unlock();
> > +
> > +	switch (ret) {
> > +	case BPF_OK:
> > +		break;
> > +
> > +	case BPF_REDIRECT:
> > +		if (!can_redirect) {
> > +			WARN_ONCE(1, "Illegal redirect return code in prog %s\n",
> > +				  lwt->name ? : "<unknown>");
> > +			ret = BPF_OK;
> > +		} else {
> > +			ret = skb_do_redirect(skb);
> 
> I think this assumes that program did bpf_skb_push and L2 header is present.
> Would it make sense to check that mac_header < network_header here to make
> sure that it actually happened? I think the cost of single 'if' isn't much.
> Also skb_do_redirect() can redirect to l3 tunnels like ipip ;)
> so program shouldn't be doing bpf_skb_push in such case...

We are currently guaranteeing mac_header <= network_header given that
bpf_skb_push() is calling skb_reset_mac_header() unconditionally.

Even if a program were to push an L2 header and then redirect to an l3
tunnel, __bpf_redirect_no_mac will pull it off again and correct the
mac_header offset.

Should we check in __bpf_redirect_common() whether mac_header <
nework_header then or add it to lwt-bpf conditional on
dev_is_mac_header_xmit()?

> May be rename bpf_skb_push to bpf_skb_push_l2 ?
> since it's doing skb_reset_mac_header(skb); at the end of it?
> Or it's probably better to use 'flags' argument to tell whether
> bpf_skb_push() should set mac_header or not ?

I added the flags for this purpose but left it unused for now. This
will allow to add a flag later to extend the l3 header prior to
pushing an l2 header, hence the current helper name. But even then,
we would always reset the mac header as well to ensure we never
leave an skb with mac_header > network_header. Are you seeing a
scenario where we would want to omit the mac header reset?

> Then this bit:
> 
> > +		case BPF_OK:
> > +			/* If the L3 header was expanded, headroom might be too
> > +			 * small for L2 header now, expand as needed.
> > +			 */
> > +			ret = xmit_check_hhlen(skb);
> 
> will work fine as well...
> which probably needs "mac_header wasn't set" check? or it's fine?

Right. The check is not strictly required right now but is a safety net
to ensure that the skb passed to dst_neigh_output() which will add the
l2 header in the non-redirect case to always have sufficient headroom.
It will currently be a NOP as we are not allowing to extend the l3 header
in bpf_skb_push() yet. I left this in there to ensure that we are not
missing to add this later.

^ permalink raw reply

* Re: [patch net / RFC] net: fec: increase frame size limitation to actually available buffer
From: Nikita Yushchenko @ 2016-11-30  6:36 UTC (permalink / raw)
  To: Andy Duan, David S. Miller, Troy Kisky, Andrew Lunn, Eric Nelson,
	Philippe Reynes, Johannes Berg, netdev@vger.kernel.org
  Cc: Chris Healy, Fabio Estevam, linux-kernel@vger.kernel.org
In-Reply-To: <AM4PR0401MB2260D022EC9B9E4C4333E621FF8C0@AM4PR0401MB2260.eurprd04.prod.outlook.com>

> But I think it is not necessary since the driver don't support jumbo frame.

Hardcoded 1522 raises two separate issues.

(1) When DSA is in use, frames processed by FEC chip contain DSA tag and
thus can be larger than hardcoded limit of 1522. This issue is not
FEC-specific, any driver that hardcodes maximum frame size to 1522 (many
do) will have this issue if used with DSA.

Clean solution for this must take into account that difference between
MTU and max frame size is no longer known at compile time. Actually this
is the case even without DSA, due to VLANs: max frame size is (MTU + 18)
without VLANs, but (MTU + 22) with VLANs. However currently drivers tend
to ignore this and hardcode 22.  With DSA, 22 is not enough, need to add
switch-specific tag size to that.

Not yet sure how to handle this. DSA-specific API to find out tag size
could be added, but generic solution should handle all cases of dynamic
difference between MTU and max frame size, not only DSA.

(2) There is some demand to use larger frames for optimization purposes.

FEC register fields that limit frame size are 14-bit, thus allowing
frames up to (4k-1). I'm about to prepare a larger patch:
- add ndo_change_mtu handler, allowing MTU up to (4k - overhead),
- set MAX_FL / TRUNC_FL based on configured MTU,
- if necessary, do buffer reallocation with larger buffers.

Is this suitable for upstreaming?
Is there any policy related to handling larger frames?

^ permalink raw reply

* Re: [PATCH net-next v5 2/3] bpf: Add new cgroup attach type to enable sock modifications
From: Alexei Starovoitov @ 2016-11-30  5:41 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, daniel, ast, daniel, maheshb, tgraf
In-Reply-To: <d3359d0c-8995-d605-cd2d-f0558fdd0408@cumulusnetworks.com>

On Tue, Nov 29, 2016 at 06:07:18PM -0700, David Ahern wrote:
> On 11/29/16 5:59 PM, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 05:43:08PM -0700, David Ahern wrote:
> >> On 11/29/16 1:01 PM, Alexei Starovoitov wrote:
> >>> Could you also expose sk_protcol and sk_type as read only fields?
> >>
> >> Those are bitfields in struct sock, so can't use offsetof or sizeof. Any existing use cases that try to load a bitfield in a bpf that I can look at?
> > 
> > pkt_type, vlan are also bitfileds in skb. Please see convert_skb_access()
> > There is a bit of ugliness due to __BIG_ENDIAN_BITFIELD though..
> > 
> 
> Given the added complexity I'd prefer to defer this second use case to a follow on patch set. This one introduces the infra for sockets and I don't see anything needing to change with it to add the read of 3 more sock elements. Agree?

I don't see a complexity. It was straightforward for skb bitfields,
but if there is some unforeseen issue, it's better to tackle it now
otherwise the feature may never come and this 'infra for sockets' will
stay as 'infra for vrf only' and I'm struggling to convince myself that it's ok.
So I'll try to tweak this patch to add these 3 fields...

^ permalink raw reply

* Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure
From: Alexei Starovoitov @ 2016-11-30  5:37 UTC (permalink / raw)
  To: John Fastabend; +Cc: Thomas Graf, davem, netdev, daniel, tom, roopa, hannes
In-Reply-To: <583E3EF4.7030107@gmail.com>

On Tue, Nov 29, 2016 at 06:52:36PM -0800, John Fastabend wrote:
> On 16-11-29 04:15 PM, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
> >> Registers new BPF program types which correspond to the LWT hooks:
> >>   - BPF_PROG_TYPE_LWT_IN   => dst_input()
> >>   - BPF_PROG_TYPE_LWT_OUT  => dst_output()
> >>   - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()
> >>
> >> The separate program types are required to differentiate between the
> >> capabilities each LWT hook allows:
> >>
> >>  * Programs attached to dst_input() or dst_output() are restricted and
> >>    may only read the data of an skb. This prevent modification and
> >>    possible invalidation of already validated packet headers on receive
> >>    and the construction of illegal headers while the IP headers are
> >>    still being assembled.
> >>
> >>  * Programs attached to lwtunnel_xmit() are allowed to modify packet
> >>    content as well as prepending an L2 header via a newly introduced
> >>    helper bpf_skb_push(). This is safe as lwtunnel_xmit() is invoked
> >>    after the IP header has been assembled completely.
> >>
> >> All BPF programs receive an skb with L3 headers attached and may return
> >> one of the following error codes:
> >>
> >>  BPF_OK - Continue routing as per nexthop
> >>  BPF_DROP - Drop skb and return EPERM
> >>  BPF_REDIRECT - Redirect skb to device as per redirect() helper.
> >>                 (Only valid in lwtunnel_xmit() context)
> >>
> >> The return codes are binary compatible with their TC_ACT_
> >> relatives to ease compatibility.
> >>
> >> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> > ...
> >> +#define LWT_BPF_MAX_HEADROOM 128
> > 
> > why 128?
> > btw I'm thinking for XDP to use 256, so metadata can be stored in there.
> > 
> 
> hopefully not too off-topic but for XDP I would like to see this get

definitely off-topic. lwt->headroom is existing concept. Too late
to do anything about it.

> passed down with the program. It would be more generic and drivers could
> configure the headroom on demand and more importantly verify that a
> program pushing data is not going to fail at runtime.

For xdp I think it will be problematic, since we'd have to check for
that value at prog array access to make sure tailcalls are not broken.
Mix and match won't be possible.
So what does 'configure the headroom on demand' buys us?
Isn't it much easier to tell all drivers "always reserve this much" ?
We burn the page anyway.
If it's configurable per driver, then we'd need an api
to retrieve it. Yet the program author doesn't care what the value is.
If program needs to do udp encap, it will try do it. No matter what.
If xdp_adjust_head() helper fails, the program will likely decide
to drop the packet. In some cases it may decide to punt to stack
for further processing, but for high performance dataplane code
it's highly unlikely.
If it's configurable to something that is not cache line boundary
hw dma performance may suffer and so on.
So I see only cons in such 'configurable headroom' and propose
to have fixed 256 bytes headroom for XDP
which is enough for any sensible encap and metadata.

^ permalink raw reply

* Re: The ubufs->refcount maybe be subtracted twice when tun_get_user failed
From: Jason Wang @ 2016-11-30  5:24 UTC (permalink / raw)
  To: wangyunjian, mst@redhat.com, netdev@vger.kernel.org; +Cc: caihe
In-Reply-To: <fad5d47a-96ee-1afc-b10c-cb2a6d5dc784@redhat.com>



On 2016年11月30日 10:53, Jason Wang wrote:
>
>
> On 2016年11月29日 21:27, wangyunjian wrote:
>> Sorry, I didn't describe it clearly. In fact, the second subtraction 
>> happens in the function handle_tx,
>> when tun_get_user fails and zcopy_used is ture. Fllowing the steps:
>
> I get your meaning. Thanks for the reporting. Will post patches (since 
> macvtap has similar issue).

Posted, appreciate if you can have a test on them.

Thanks

^ permalink raw reply

* Re: [net-next PATCH v3 6/6] virtio_net: xdp, add slowpath case for non contiguous buffers
From: Alexei Starovoitov @ 2016-11-30  5:23 UTC (permalink / raw)
  To: John Fastabend
  Cc: Michael S. Tsirkin, eric.dumazet, daniel, shm, davem, tgraf,
	john.r.fastabend, netdev, bblanco, brouer
In-Reply-To: <583E3E6A.8050706@gmail.com>

On Tue, Nov 29, 2016 at 06:50:18PM -0800, John Fastabend wrote:
> On 16-11-29 04:37 PM, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 12:11:33PM -0800, John Fastabend wrote:
> >> virtio_net XDP support expects receive buffers to be contiguous.
> >> If this is not the case we enable a slowpath to allow connectivity
> >> to continue but at a significan performance overhead associated with
> >> linearizing data. To make it painfully aware to users that XDP is
> >> running in a degraded mode we throw an xdp buffer error.
> >>
> >> To linearize packets we allocate a page and copy the segments of
> >> the data, including the header, into it. After this the page can be
> >> handled by XDP code flow as normal.
> >>
> >> Then depending on the return code the page is either freed or sent
> >> to the XDP xmit path. There is no attempt to optimize this path.
> >>
> >> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> > ...
> >> +/* The conditions to enable XDP should preclude the underlying device from
> >> + * sending packets across multiple buffers (num_buf > 1). However per spec
> >> + * it does not appear to be illegal to do so but rather just against convention.
> >> + * So in order to avoid making a system unresponsive the packets are pushed
> >> + * into a page and the XDP program is run. This will be extremely slow and we
> >> + * push a warning to the user to fix this as soon as possible. Fixing this may
> >> + * require resolving the underlying hardware to determine why multiple buffers
> >> + * are being received or simply loading the XDP program in the ingress stack
> >> + * after the skb is built because there is no advantage to running it here
> >> + * anymore.
> >> + */
> > ...
> >>  		if (num_buf > 1) {
> >>  			bpf_warn_invalid_xdp_buffer();
> 			^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> Here is the warn once call. I made it a defined bpf warning so that we
> can easily identify if needed and it can be used by other drivers as
> well.

ahh. I thought it has - in front of it :)
and you're deleting it in this patch.

> >> -			goto err_xdp;
> >> +
> >> +			/* linearize data for XDP */
> >> +			xdp_page = xdp_linearize_page(rq, num_buf,
> >> +						      page, offset, &len);
> >> +			if (!xdp_page)
> >> +				goto err_xdp;
> > 
> > in case when we're 'lucky' the performance will silently be bad.
> 
> Were you specifically worried about the alloc failing here and no
> indication? I was thinking a statistic counter could be added as a
> follow on series to catch this and other such cases in non XDP paths
> if needed.
> 
> > Can we do warn_once here? so at least something in dmesg points out
> > that performance is not as expected. Am I reading it correctly that
> > you had to do a special kernel hack to trigger this situation and
> > in all normal cases it's not the case?
> > 
> 
> Correct the only way to produce this with upstream qemu and Linux is to
> write a kernel hack to build these types of buffers. AFAIK I caught all
> the cases where it could happen otherwise in the setup xdp prog call and
> required user to configure device driver so they can not happen e.g.
> disable LRO, set MTU < PAGE_SIZE, etc.
> 
> If I missed anything, I don't see it now, I would call it a bug and fix
> it. Again AFAIK everything should be covered though. As Micheal pointed
> out earlier although unexpected its not strictly against virtio spec to
> build such packets so we should handle it at least in a degraded mode
> even though it is not expected in any qemu cases.

Ok. Makes sense. The above wasn't clear in the commit log.
Thanks

^ permalink raw reply

* [PATCH net 2/2] macvtap: handle ubuf refcount correctly when meet erros
From: Jason Wang @ 2016-11-30  5:17 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, wangyunjian, Jason Wang
In-Reply-To: <1480483072-14201-1-git-send-email-jasowang@redhat.com>

We trigger uarg->callback() immediately after we decide do datacopy
even if caller want to do zerocopy. This will cause the callback
(vhost_net_zerocopy_callback) decrease the refcount. But when we meet
an error afterwards, the error handling in vhost handle_tx() will try
to decrease it again. This is wrong and fix this by delay the
uarg->callback() until we're sure there's no errors.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
The patch is needed for -stable.
---
 drivers/net/macvtap.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index bceca28..7869b06 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -742,13 +742,8 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 
 	if (zerocopy)
 		err = zerocopy_sg_from_iter(skb, from);
-	else {
+	else
 		err = skb_copy_datagram_from_iter(skb, 0, from, len);
-		if (!err && m && m->msg_control) {
-			struct ubuf_info *uarg = m->msg_control;
-			uarg->callback(uarg, false);
-		}
-	}
 
 	if (err)
 		goto err_kfree;
@@ -779,7 +774,11 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 		skb_shinfo(skb)->destructor_arg = m->msg_control;
 		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+	} else if (m && m->msg_control) {
+		struct ubuf_info *uarg = m->msg_control;
+		uarg->callback(uarg, false);
 	}
+
 	if (vlan) {
 		skb->dev = vlan->dev;
 		dev_queue_xmit(skb);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 1/2] tun: handle ubuf refcount correctly when meet erros
From: Jason Wang @ 2016-11-30  5:17 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, wangyunjian, Jason Wang

We trigger uarg->callback() immediately after we decide do datacopy
even if caller want to do zerocopy. This will cause the callback
(vhost_net_zerocopy_callback) decrease the refcount. But when we meet
an error afterwards, the error handling in vhost handle_tx() will try
to decrease it again. This is wrong and fix this by delay the
uarg->callback() until we're sure there's no errors.

Reported-by: wangyunjian <wangyunjian@huawei.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
The patch is needed for -stable.
---
 drivers/net/tun.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8093e39..db6acec 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1246,13 +1246,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	if (zerocopy)
 		err = zerocopy_sg_from_iter(skb, from);
-	else {
+	else
 		err = skb_copy_datagram_from_iter(skb, 0, from, len);
-		if (!err && msg_control) {
-			struct ubuf_info *uarg = msg_control;
-			uarg->callback(uarg, false);
-		}
-	}
 
 	if (err) {
 		this_cpu_inc(tun->pcpu_stats->rx_dropped);
@@ -1298,6 +1293,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb_shinfo(skb)->destructor_arg = msg_control;
 		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+	} else if (msg_control) {
+		struct ubuf_info *uarg = msg_control;
+		uarg->callback(uarg, false);
 	}
 
 	skb_reset_network_header(skb);
-- 
2.7.4

^ permalink raw reply related

* Re: netlink: GPF in sock_sndtimeo
From: Richard Guy Briggs @ 2016-11-30  4:52 UTC (permalink / raw)
  To: Cong Wang
  Cc: Dmitry Vyukov, David Miller, Johannes Berg, Florian Westphal,
	Eric Dumazet, Herbert Xu, netdev, LKML, syzkaller
In-Reply-To: <CAM_iQpX_KBEpL1dCBNMgZQ-VQJM9nmsVdRPVK-Aen9k0gV3f0w@mail.gmail.com>

On 2016-11-29 15:13, Cong Wang wrote:
> On Tue, Nov 29, 2016 at 8:48 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2016-11-26 17:11, Cong Wang wrote:
> >> It is racy on audit_sock, especially on the netns exit path.
> >
> > I think that is the only place it is racy.  The other places audit_sock
> > is set is when the socket failure has just triggered a reset.
> >
> > Is there a notifier callback for failed or reaped sockets?
> 
> Is NETLINK_URELEASE event what you are looking for?

Possibly, yes.  Thanks, I'll have a look.

- RGB

^ permalink raw reply

* Re: [PATCH] net: macb: Write only necessary bits in NCR in macb reset
From: Harini Katakam @ 2016-11-30  4:17 UTC (permalink / raw)
  To: David Miller
  Cc: Harini Katakam, Nicolas Ferre, netdev,
	linux-kernel@vger.kernel.org, michals@xilinx.com
In-Reply-To: <20161129.190526.1414678491662084583.davem@davemloft.net>

Hi David,

On Wed, Nov 30, 2016 at 5:35 AM, David Miller <davem@davemloft.net> wrote:
> From: Harini Katakam <harini.katakam@xilinx.com>
> Date: Mon, 28 Nov 2016 14:53:49 +0530
>
>> In macb_reset_hw, use read-modify-write to disable RX and TX.
>> This way exiting settings and reserved bits wont be disturbed.
>> Use the same method for clearing statistics as well.
>>
>> Signed-off-by: Harini Katakam <harinik@xilinx.com>
>
> This doesn't make much sense to me.
>
> Consider the two callers of this function.
>
> macb_init_hw() is going to do a non-masking write to the NCR
> register:
>
>         /* Enable TX and RX */
>         macb_writel(bp, NCR, MACB_BIT(RE) | MACB_BIT(TE) | MACB_BIT(MPE));
>
> So obviously no other writable fields matter at all for programming
> the chip properly, otherwise macb_init_hw() would "or" in the bits
> after a read of NCR.  But that's not what this code does, it
> writes "RE | TE | MPE" directly.
>
> And the other caller is macb_close() which is shutting down the
> chip so can zero out all the other bits and it can't possibly
> matter, also due to the assertion above about macb_init_hw()
> showing that only the RE, TE, and MPE bits matter for proper
> functioning of the chip.
>
> You haven't shown a issue caused by the way the code works now, so
> this patch isn't fixing a bug.  In fact, the "bit preserving" would
> even be misleading to someone reading the code.  They will ask
> themselves what bits need to be preserved, and as shown above none of
> them need to be.
>
> I'm not applying this, sorry.
>

Thanks for the detailed analysis.
I noticed an issue with respect to management port enable bit
when working on the patch series for macb mdio driver for
multi MAC - multi PHY access via same mdio bus.
MPE bit of the MAC (whose phy management register
is used) was being reset. But that is a separate
series still in review.
You are right, there is no existing bug that this
patch addresses. I will come back to this later if
required.

Regards,
Harini

^ permalink raw reply

* Re: [PATCH net 00/16] net: fix fixed-link phydev leaks
From: David Miller @ 2016-11-30  4:17 UTC (permalink / raw)
  To: johan
  Cc: vbridger, f.fainelli, fugang.duan, pantelis.antoniou, vbordug,
	claudiu.manoil, leoli, thomas.petazzoni, nbd, blogic,
	matthias.bgg, sergei.shtylyov, lars.persson, mugunthanvnm,
	grygorii.strashko, robh+dt, frowand.list, andrew, vivien.didelot,
	netdev, nios2-dev, linux-kernel, linuxppc-dev, linux-mediatek,
	linux-renesas-soc, linux-omap, devicetree
In-Reply-To: <1480357509-28074-1-git-send-email-johan@kernel.org>

From: Johan Hovold <johan@kernel.org>
Date: Mon, 28 Nov 2016 19:24:53 +0100

> This series fixes failures to deregister and free fixed-link phydevs
> that have been registered using the of_phy_register_fixed_link()
> interface.
> 
> All but two drivers currently fail to do this and this series fixes most
> of them with the exception of a staging driver and the stmmac drivers
> which will be fixed by follow-on patches.
> 
> Included are also a couple of fixes for related of-node leaks.
> 
> Note that all patches except the of_mdio one have been compile-tested
> only.
> 
> Also note that the series is against net due to dependencies not yet in
> net-next.

Series applied, thanks Johan.

^ permalink raw reply

* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: Tom Herbert @ 2016-11-30  4:04 UTC (permalink / raw)
  To: David Lebrun; +Cc: Linux Kernel Network Developers
In-Reply-To: <1480439718-18019-1-git-send-email-david.lebrun@uclouvain.be>

On Tue, Nov 29, 2016 at 9:15 AM, David Lebrun <david.lebrun@uclouvain.be> wrote:
> When multiple nexthops are available for a given route, the routing engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting value
> indexes the nexthop to select. This method causes issues when a new nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to another
> nexthop.
>
This is a lot of code to make ECMP work better. Can you be more
specific as to what the "issues" are? Assuming this is just the
transient packet reorder that happens in one link flap I am wondering
if this complexity is justified.

Tom

> This patch implements a consistent hash method to select the nexthop in
> case of ECMP. The idea is to generate K slices (or intervals) for each
> route with multiple nexthops. The nexthops are randomly assigned to those
> slices, in a uniform manner. The number K is configurable through a sysctl
> net.ipv6.route.ecmp_slices and is always an exponent of 2. To select the
> nexthop, the algorithm takes the flow hash and computes an index which is
> the flow hash modulo K. As K = 2^x, the modulo can be computed using a
> simple binary AND operation (idx = hash & (K - 1)). The resulting index
> references the selected nexthop. The lookup time complexity is thus O(1).
>
> When a nexthop is added, it steals K/N slices from the other nexthops,
> where N is the new number of nexthops. The slices are stolen randomly and
> uniformly from the other nexthops. When a nexthop is removed, the orphan
> slices are randomly reassigned to the other nexthops.
>
> The number of slices for a route also fixes the maximum number of nexthops
> possible for that route.
>
> Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
> ---
>  include/net/ip6_fib.h    |  25 +++++++
>  include/net/netns/ipv6.h |   1 +
>  net/ipv6/ip6_fib.c       | 174 ++++++++++++++++++++++++++++++++++++++++++++++-
>  net/ipv6/route.c         |  58 ++++++++--------
>  4 files changed, 229 insertions(+), 29 deletions(-)
>
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index a74e2aa..29594cc 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -93,6 +93,30 @@ struct rt6key {
>
>  struct fib6_table;
>
> +/* Multipath nexthop.
> + * @nh is the actual nexthop
> + * @nslices is the number of slices assigned to this nexthop
> + */
> +struct rt6_multi_nh {
> +       struct rt6_info *nh;
> +       unsigned int    nslices;
> +};
> +
> +/* Multipath routes map.
> + * @nhs is an array of available nexthops
> + * @size is the number of elements in @nhs
> + * @nslices is the number of slices and the number of elements in @index
> + *         and is always in the form 2^x to prevent using a division for
> + *         the computation of the modulo.
> + * @index is an array mapping a slice index to a nexthop index in @nhs
> + */
> +struct rt6_multi_map {
> +       struct rt6_multi_nh     *nhs;
> +       unsigned int            size;
> +       unsigned int            nslices;
> +       __u8                    *index;
> +};
> +
>  struct rt6_info {
>         struct dst_entry                dst;
>
> @@ -113,6 +137,7 @@ struct rt6_info {
>          */
>         struct list_head                rt6i_siblings;
>         unsigned int                    rt6i_nsiblings;
> +       struct rt6_multi_map            *rt6i_nh_map;
>
>         atomic_t                        rt6i_ref;
>
> diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
> index de7745e..4967846 100644
> --- a/include/net/netns/ipv6.h
> +++ b/include/net/netns/ipv6.h
> @@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
>         int idgen_retries;
>         int idgen_delay;
>         int flowlabel_state_ranges;
> +       int ip6_rt_ecmp_slices;
>  };
>
>  struct netns_ipv6 {
> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> index ef54852..b6b1895 100644
> --- a/net/ipv6/ip6_fib.c
> +++ b/net/ipv6/ip6_fib.c
> @@ -727,6 +727,166 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
>         }
>  }
>
> +static void fib6_mp_free(struct rt6_info *rt)
> +{
> +       struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
> +       struct rt6_info *sibling;
> +
> +       if (nh_map) {
> +               list_for_each_entry(sibling, &rt->rt6i_siblings,
> +                                   rt6i_siblings) {
> +                       sibling->rt6i_nh_map = NULL;
> +               }
> +
> +               rt->rt6i_nh_map = NULL;
> +
> +               kfree(nh_map->nhs);
> +               kfree(nh_map->index);
> +               kfree(nh_map);
> +       }
> +}
> +
> +static int fib6_mp_extend(struct net *net, struct rt6_info *sref,
> +                         struct rt6_info *rt)
> +{
> +       struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
> +       struct rt6_multi_nh *tmp_nhs, *old_nhs;
> +       unsigned int kslices;
> +       unsigned int i, j;
> +
> +       if (!nh_map) {
> +               __u8 *index;
> +
> +               nh_map = kmalloc(sizeof(*nh_map), GFP_ATOMIC);
> +               if (!nh_map)
> +                       return -ENOMEM;
> +
> +               nh_map->size = 1;
> +               nh_map->nslices = (1 << net->ipv6.sysctl.ip6_rt_ecmp_slices);
> +
> +               tmp_nhs = kmalloc(sizeof(*tmp_nhs), GFP_ATOMIC);
> +               if (!tmp_nhs) {
> +                       kfree(nh_map);
> +                       return -ENOMEM;
> +               }
> +
> +               tmp_nhs[0].nh = sref;
> +               tmp_nhs[0].nslices = nh_map->nslices;
> +               nh_map->nhs = tmp_nhs;
> +
> +               index = kmalloc_array(nh_map->nslices, sizeof(*index),
> +                                     GFP_ATOMIC);
> +               if (!index) {
> +                       kfree(nh_map->nhs);
> +                       kfree(nh_map);
> +                       return -ENOMEM;
> +               }
> +
> +               for (i = 0; i < nh_map->nslices; i++)
> +                       index[i] = 0;
> +
> +               nh_map->index = index;
> +               sref->rt6i_nh_map = nh_map;
> +       }
> +
> +       if (nh_map->size == nh_map->nslices)
> +               return -ENOBUFS;
> +
> +       nh_map->size++;
> +
> +       tmp_nhs = kmalloc_array(nh_map->size, sizeof(*tmp_nhs), GFP_ATOMIC);
> +       if (!tmp_nhs)
> +               return -ENOMEM;
> +
> +       for (i = 0; i < nh_map->size - 1; i++)
> +               tmp_nhs[i] = nh_map->nhs[i];
> +
> +       tmp_nhs[nh_map->size - 1].nh = rt;
> +       tmp_nhs[nh_map->size - 1].nslices = 0;
> +
> +       kslices = nh_map->nslices / nh_map->size;
> +
> +       /* Loop over nexthops and steal a random slice until we have kslices for
> +        * the new nexthop. This algorithm ensures that flows are taken as
> +        * equally as possible from current nexthops.
> +        *
> +        * At each iteration, @j is the index in tmp_nhs of the nexthop
> +        * to steal from and @idx is the index of the slice to steal
> +        * among @j's slices.
> +        */
> +       for (i = 0, j = 0; i < kslices; i++) {
> +               unsigned int idx, k = 0;
> +
> +               if (tmp_nhs[j].nslices == 1)
> +                       continue;
> +
> +               idx = (prandom_u32() % tmp_nhs[j].nslices) + 1;
> +               do {
> +                       if (nh_map->index[k++] == j)
> +                               idx--;
> +               } while (idx);
> +
> +               nh_map->index[k - 1] = nh_map->size - 1;
> +               tmp_nhs[nh_map->size - 1].nslices++;
> +               tmp_nhs[j].nslices--;
> +
> +               j = (j + 1) % (nh_map->size - 1);
> +       }
> +
> +       WARN_ON(tmp_nhs[nh_map->size - 1].nslices != kslices);
> +
> +       old_nhs = nh_map->nhs;
> +       nh_map->nhs = tmp_nhs;
> +       kfree(old_nhs);
> +
> +       rt->rt6i_nh_map = nh_map;
> +
> +       return 0;
> +}
> +
> +static int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt)
> +{
> +       struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
> +       struct rt6_multi_nh *tmp_nhs, *old_nhs;
> +       unsigned int i, j, idx = 0;
> +
> +       tmp_nhs = kmalloc_array(nh_map->size - 1, sizeof(*tmp_nhs), GFP_ATOMIC);
> +       if (!tmp_nhs)
> +               return -ENOMEM;
> +
> +       for (i = 0, j = 0; i < nh_map->size; i++) {
> +               if (nh_map->nhs[i].nh != rt)
> +                       tmp_nhs[j++] = nh_map->nhs[i];
> +               else
> +                       idx = i;
> +       }
> +
> +       nh_map->size--;
> +       old_nhs = nh_map->nhs;
> +       nh_map->nhs = tmp_nhs;
> +       kfree(old_nhs);
> +
> +       rt->rt6i_nh_map = NULL;
> +
> +       /* Update the slices index. For each slice mapping to the removed
> +        * nexthop, assign another random nexthop. For each slice mapping
> +        * to a nexthop that was after the removed nexthop, decrement the
> +        * nexthop index to reflect the shift. Nexthops that were before
> +        * the removed nexthop are unaffected.
> +        */
> +       for (i = 0; i < nh_map->nslices; i++) {
> +               if (nh_map->index[i] == idx) {
> +                       j = prandom_u32() % nh_map->size;
> +                       nh_map->index[i] = j;
> +                       nh_map->nhs[j].nslices++;
> +               } else if (nh_map->index[i] > idx) {
> +                       nh_map->index[i]--;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
>  /*
>   *     Insert routing information in a node.
>   */
> @@ -837,6 +997,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
>                         }
>                         sibling = sibling->dst.rt6_next;
>                 }
> +               err = fib6_mp_extend(info->nl_net, sibling, rt);
> +               if (err < 0)
> +                       return err;
>                 /* For each sibling in the list, increment the counter of
>                  * siblings. BUG() if counters does not match, list of siblings
>                  * is broken!
> @@ -900,6 +1063,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
>                         fn->fn_flags |= RTN_RTINFO;
>                 }
>                 nsiblings = iter->rt6i_nsiblings;
> +               if (nsiblings)
> +                       fib6_mp_free(iter);
>                 fib6_purge_rt(iter, fn, info->nl_net);
>                 rt6_release(iter);
>
> @@ -1407,13 +1572,20 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
>
>         /* Remove this entry from other siblings */
>         if (rt->rt6i_nsiblings) {
> -               struct rt6_info *sibling, *next_sibling;
> +               struct rt6_info *sibling, *next_sibling, *sref;
> +
> +               sref = list_first_entry(&rt->rt6i_siblings, struct rt6_info,
> +                                       rt6i_siblings);
>
>                 list_for_each_entry_safe(sibling, next_sibling,
>                                          &rt->rt6i_siblings, rt6i_siblings)
>                         sibling->rt6i_nsiblings--;
>                 rt->rt6i_nsiblings = 0;
>                 list_del_init(&rt->rt6i_siblings);
> +               if (!sref->rt6i_nsiblings)
> +                       fib6_mp_free(sref);
> +               else
> +                       fib6_mp_shrink(sref, rt);
>         }
>
>         /* Adjust walkers */
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index b317bb1..e9f13e0 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -427,39 +427,27 @@ static bool rt6_check_expired(const struct rt6_info *rt)
>         return false;
>  }
>
> -/* Multipath route selection:
> - *   Hash based function using packet header and flowlabel.
> - * Adapted from fib_info_hashfn()
> - */
> -static int rt6_info_hash_nhsfn(unsigned int candidate_count,
> -                              const struct flowi6 *fl6)
> -{
> -       return get_hash_from_flowi6(fl6) % candidate_count;
> -}
> -
>  static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
>                                              struct flowi6 *fl6, int oif,
>                                              int strict)
>  {
> -       struct rt6_info *sibling, *next_sibling;
> -       int route_choosen;
> +       struct rt6_multi_map *nh_map = match->rt6i_nh_map;
> +       __u32 hash = get_hash_from_flowi6(fl6);
> +       unsigned int slice, idx;
> +       struct rt6_info *res;
>
> -       route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
> -       /* Don't change the route, if route_choosen == 0
> -        * (siblings does not include ourself)
> -        */
> -       if (route_choosen)
> -               list_for_each_entry_safe(sibling, next_sibling,
> -                               &match->rt6i_siblings, rt6i_siblings) {
> -                       route_choosen--;
> -                       if (route_choosen == 0) {
> -                               if (rt6_score_route(sibling, oif, strict) < 0)
> -                                       break;
> -                               match = sibling;
> -                               break;
> -                       }
> -               }
> -       return match;
> +       WARN_ON(!nh_map);
> +       if (!nh_map)
> +               return match;
> +
> +       slice = hash & (nh_map->nslices - 1);
> +       idx = nh_map->index[slice];
> +       res = nh_map->nhs[idx].nh;
> +
> +       if (rt6_score_route(res, oif, strict) < 0)
> +               res = match;
> +
> +       return res;
>  }
>
>  /*
> @@ -3550,6 +3538,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
>         return 0;
>  }
>
> +static int one = 1;
> +static int thirtyone = 31;
> +
>  struct ctl_table ipv6_route_table_template[] = {
>         {
>                 .procname       =       "flush",
> @@ -3621,6 +3612,15 @@ struct ctl_table ipv6_route_table_template[] = {
>                 .mode           =       0644,
>                 .proc_handler   =       proc_dointvec_ms_jiffies,
>         },
> +       {
> +               .procname       =       "ecmp_slices",
> +               .data           =       &init_net.ipv6.sysctl.ip6_rt_ecmp_slices,
> +               .maxlen         =       sizeof(int),
> +               .mode           =       0644,
> +               .proc_handler   =       proc_dointvec_minmax,
> +               .extra1         =       &one,
> +               .extra2         =       &thirtyone,
> +       },
>         { }
>  };
>
> @@ -3644,6 +3644,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
>                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
>                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
>                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
> +               table[10].data = &net->ipv6.sysctl.ip6_rt_ecmp_slices;
>
>                 /* Don't export sysctls to unprivileged users */
>                 if (net->user_ns != &init_user_ns)
> @@ -3707,6 +3708,7 @@ static int __net_init ip6_route_net_init(struct net *net)
>         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
>         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
>         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
> +       net->ipv6.sysctl.ip6_rt_ecmp_slices = 5;
>
>         net->ipv6.ip6_rt_gc_expire = 30*HZ;
>
> --
> 2.7.3
>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox