Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 03/11] vxlan: Allow configuration of DF behaviour
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.

For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.

According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.

Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/vxlan.c          | 29 +++++++++++++++++++++++++++++
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  9 +++++++++
 3 files changed, 39 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 7706e392b2a7..ccb19b833706 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2289,6 +2289,19 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			df = htons(IP_DF);
 		}
 
+		if (!df) {
+			if (vxlan->cfg.df == VXLAN_DF_SET) {
+				df = htons(IP_DF);
+			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
+				struct ethhdr *eth = eth_hdr(skb);
+
+				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
+				    (ntohs(eth->h_proto) == ETH_P_IP &&
+				     old_iph->frag_off & htons(IP_DF)))
+					df = htons(IP_DF);
+			}
+		}
+
 		ndst = &rt->dst;
 		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
 
@@ -2837,6 +2850,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
+	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -2893,6 +2907,16 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_VXLAN_DF]) {
+		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
+		if (df < 0 || df > VXLAN_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -3538,6 +3562,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
 	}
 
+	if (data[IFLA_VXLAN_DF])
+		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
 	return 0;
 }
 
@@ -3630,6 +3657,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
@@ -3696,6 +3724,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
 		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 03431c148e16..ec999c49df1f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -216,6 +216,7 @@ struct vxlan_config {
 	unsigned long		age_interval;
 	unsigned int		addrmax;
 	bool			no_share;
+	enum ifla_vxlan_df	df;
 };
 
 struct vxlan_dev_node {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 1debfa42cba1..efc588949431 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -533,6 +533,7 @@ enum {
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
+	IFLA_VXLAN_DF,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
@@ -542,6 +543,14 @@ struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+enum ifla_vxlan_df {
+	VXLAN_DF_UNSET = 0,
+	VXLAN_DF_SET,
+	VXLAN_DF_INHERIT,
+	__VXLAN_DF_END,
+	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
+};
+
 /* GENEVE section */
 enum {
 	IFLA_GENEVE_UNSPEC,
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 04/11] selftests: pmtu: Introduce tests for IPv4/IPv6 over VxLAN over IPv6
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Use a router between endpoints, implemented via namespaces, set a low MTU
between router and destination endpoint, exceed it and check PMTU value in
route exceptions.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
This only introduces tests over VxLAN over IPv6 right now. I'll introduce
tests over IPv4 (they can be added trivially) once DF configuration support
is accepted into iproute2.

 tools/testing/selftests/net/pmtu.sh | 115 +++++++++++++++++++++++-----
 1 file changed, 97 insertions(+), 18 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index a369d616b390..19ede74af560 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -26,6 +26,17 @@
 # - pmtu_ipv6
 #	Same as pmtu_ipv4, except for locked PMTU tests, using IPv6
 #
+# - pmtu_ipv4_vxlan6_exception
+#	Set up the same network topology as pmtu_ipv4, create a VxLAN tunnel
+#	over IPv6 between A and B, routed via R1. On the link between R1 and B,
+#	set a MTU lower than the VxLAN MTU and the MTU on the link between A and
+#	R1. Send IPv4 packets, exceeding the MTU between R1 and B, over VxLAN
+#	from A to B and check that the PMTU exception is created with the right
+#	value on A
+#
+# - pmtu_ipv6_vxlan6_exception
+#	Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B
+#
 # - pmtu_vti4_exception
 #	Set up vti tunnel on top of veth, with xfrm states and policies, in two
 #	namespaces with matching endpoints. Check that route exception is not
@@ -72,6 +83,8 @@ which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
 tests="
 	pmtu_ipv4_exception		ipv4: PMTU exceptions
 	pmtu_ipv6_exception		ipv6: PMTU exceptions
+	pmtu_ipv4_vxlan6_exception	IPv4 over vxlan6: PMTU exceptions
+	pmtu_ipv6_vxlan6_exception	IPv6 over vxlan6: PMTU exceptions
 	pmtu_vti6_exception		vti6: PMTU exceptions
 	pmtu_vti4_exception		vti4: PMTU exceptions
 	pmtu_vti4_default_mtu		vti4: default MTU assignment
@@ -95,8 +108,8 @@ ns_r2="ip netns exec ${NS_R2}"
 # Addresses are:
 # - IPv4: PREFIX4.SEGMENT.ID (/24)
 # - IPv6: PREFIX6:SEGMENT::ID (/64)
-prefix4="192.168"
-prefix6="fd00"
+prefix4="10.0"
+prefix6="fc00"
 a_r1=1
 a_r2=2
 b_r1=3
@@ -129,12 +142,12 @@ veth6_a_addr="fd00:1::a"
 veth6_b_addr="fd00:1::b"
 veth6_mask="64"
 
-vti4_a_addr="192.168.2.1"
-vti4_b_addr="192.168.2.2"
-vti4_mask="24"
-vti6_a_addr="fd00:2::a"
-vti6_b_addr="fd00:2::b"
-vti6_mask="64"
+tunnel4_a_addr="192.168.2.1"
+tunnel4_b_addr="192.168.2.2"
+tunnel4_mask="24"
+tunnel6_a_addr="fd00:2::a"
+tunnel6_b_addr="fd00:2::b"
+tunnel6_mask="64"
 
 dummy6_0_addr="fc00:1000::0"
 dummy6_1_addr="fc00:1001::0"
@@ -202,11 +215,34 @@ setup_vti() {
 }
 
 setup_vti4() {
-	setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} ${vti4_a_addr} ${vti4_b_addr} ${vti4_mask}
+	setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} ${tunnel4_a_addr} ${tunnel4_b_addr} ${tunnel4_mask}
 }
 
 setup_vti6() {
-	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${vti6_a_addr} ${vti6_b_addr} ${vti6_mask}
+	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask}
+}
+
+setup_vxlan() {
+	a_addr="${1}"
+	b_addr="${2}"
+
+	${ns_a} ip link add vxlan_a type vxlan id 1 local ${a_addr} remote ${b_addr} ttl 64 dstport 4789 || return 1
+	${ns_b} ip link add vxlan_b type vxlan id 1 local ${b_addr} remote ${a_addr} ttl 64 dstport 4789
+
+	${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask}   dev vxlan_a
+	${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask}   dev vxlan_b
+
+	${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask}   dev vxlan_a
+	${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask}   dev vxlan_b
+
+	${ns_a} ip link set vxlan_a up
+	${ns_b} ip link set vxlan_b up
+
+	sleep 1
+}
+
+setup_vxlan6() {
+	setup_vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
 }
 
 setup_xfrm() {
@@ -465,6 +501,49 @@ test_pmtu_ipv6_exception() {
 	test_pmtu_ipvX 6
 }
 
+test_pmtu_ipvX_over_vxlan6_exception() {
+	family=${1}
+	ll_mtu=4000
+
+	setup namespaces routing vxlan6 || return 2
+	#                      IPv6 header   UDP header   VxLAN header   Ethernet header
+	exp_mtu=$((${ll_mtu} - 40          - 8          - 8            - 14))
+
+	trace "${ns_a}" vxlan_a      "${ns_b}"  vxlan_b \
+	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B
+
+	if [ ${family} -eq 4 ]; then
+		ping=ping
+		dst=${tunnel4_b_addr}
+	else
+		ping=${ping6}
+		dst=${tunnel6_b_addr}
+	fi
+
+	# Create route exception by exceeding link layer MTU
+	mtu "${ns_a}"  veth_A-R1 $((${ll_mtu} + 1000))
+	mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
+	mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+	mtu "${ns_a}" vxlan_a $((${ll_mtu} + 1000))
+	mtu "${ns_b}" vxlan_b $((${ll_mtu} + 1000))
+	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null
+
+	# Check that exception was created
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on VxLAN interface"
+}
+
+test_pmtu_ipv4_vxlan6_exception() {
+	test_pmtu_ipvX_over_vxlan6_exception 4
+}
+
+test_pmtu_ipv6_vxlan6_exception() {
+	test_pmtu_ipvX_over_vxlan6_exception 6
+}
+
 test_pmtu_vti4_exception() {
 	setup namespaces veth vti4 xfrm4 || return 2
 	trace "${ns_a}" veth_a    "${ns_b}" veth_b \
@@ -484,14 +563,14 @@ test_pmtu_vti4_exception() {
 
 	# Send DF packet without exceeding link layer MTU, check that no
 	# exception is created
-	${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${vti4_b_addr} > /dev/null
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})"
+	${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
 	check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1
 
 	# Now exceed link layer MTU by one byte, check that exception is created
 	# with the right PMTU value
-	${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${vti4_b_addr} > /dev/null
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})"
+	${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
 	check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))"
 }
 
@@ -506,20 +585,20 @@ test_pmtu_vti6_exception() {
 	mtu "${ns_b}" veth_b 4000
 	mtu "${ns_a}" vti6_a 5000
 	mtu "${ns_b}" vti6_b 5000
-	${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${vti6_b_addr} > /dev/null
+	${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${tunnel6_b_addr} > /dev/null
 
 	# Check that exception was created
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})"
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
 	check_pmtu_value any "${pmtu}" "creating tunnel exceeding link layer MTU" || return 1
 
 	# Decrease tunnel MTU, check for PMTU decrease in route exception
 	mtu "${ns_a}" vti6_a 3000
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})"
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
 	check_pmtu_value "3000" "${pmtu}" "decreasing tunnel MTU" || fail=1
 
 	# Increase tunnel MTU, check for PMTU increase in route exception
 	mtu "${ns_a}" vti6_a 9000
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})"
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
 	check_pmtu_value "9000" "${pmtu}" "increasing tunnel MTU" || fail=1
 
 	return ${fail}
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 05/11] geneve: ICMP error lookup handler
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Export an encap_err_lookup() operation to match an ICMP error against a
valid VNI.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/geneve.c | 52 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index a0cd1c41cf5f..8a69879d516a 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -387,6 +387,57 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
+/* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */
+static int geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb)
+{
+	struct genevehdr *geneveh;
+	struct geneve_sock *gs;
+	u8 zero_vni[3] = { 0 };
+	u8 *vni = zero_vni;
+
+	if (skb->len < GENEVE_BASE_HLEN)
+		return -EINVAL;
+
+	geneveh = geneve_hdr(skb);
+	if (geneveh->ver != GENEVE_VER)
+		return -EINVAL;
+
+	if (geneveh->proto_type != htons(ETH_P_TEB))
+		return -EINVAL;
+
+	gs = rcu_dereference_sk_user_data(sk);
+	if (!gs)
+		return -ENOENT;
+
+	if (geneve_get_sk_family(gs) == AF_INET) {
+		struct iphdr *iph = ip_hdr(skb);
+		__be32 addr4 = 0;
+
+		if (!gs->collect_md) {
+			vni = geneve_hdr(skb)->vni;
+			addr4 = iph->daddr;
+		}
+
+		return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT;
+	}
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (geneve_get_sk_family(gs) == AF_INET6) {
+		struct ipv6hdr *ip6h = ipv6_hdr(skb);
+		struct in6_addr addr6 = { 0 };
+
+		if (!gs->collect_md) {
+			vni = geneve_hdr(skb)->vni;
+			addr6 = ip6h->daddr;
+		}
+
+		return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT;
+	}
+#endif
+
+	return -EPFNOSUPPORT;
+}
+
 static struct socket *geneve_create_sock(struct net *net, bool ipv6,
 					 __be16 port, bool ipv6_rx_csum)
 {
@@ -544,6 +595,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	tunnel_cfg.gro_receive = geneve_gro_receive;
 	tunnel_cfg.gro_complete = geneve_gro_complete;
 	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+	tunnel_cfg.encap_err_lookup = geneve_udp_encap_err_lookup;
 	tunnel_cfg.encap_destroy = NULL;
 	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 	list_add(&gs->list, &gn->sock_list);
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 06/11] geneve: Allow configuration of DF behaviour
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

draft-ietf-nvo3-geneve-08 says:

   It is strongly RECOMMENDED that Path MTU Discovery ([RFC1191],
   [RFC1981]) be used by setting the DF bit in the IP header when Geneve
   packets are transmitted over IPv4 (this is the default with IPv6).

Now that ICMP error handling is working for GENEVE, we can comply with
this recommendation.

Make this configurable, though, to avoid breaking existing setups. By
default, DF won't be set. It can be set or inherited from inner IPv4
packets. If it's configured to be inherited and we are encapsulating IPv6,
it will be set.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/geneve.c         | 52 +++++++++++++++++++++++++++++++-----
 include/uapi/linux/if_link.h |  9 +++++++
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 8a69879d516a..cafdee06b5c8 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -70,6 +70,7 @@ struct geneve_dev {
 	bool		   collect_md;
 	bool		   use_udp6_rx_checksums;
 	bool		   ttl_inherit;
+	enum ifla_geneve_df df;
 };
 
 struct geneve_sock {
@@ -898,7 +899,24 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			ttl = key->ttl;
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 	}
+
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+	if (!df) {
+		if (geneve->df == GENEVE_DF_SET) {
+			df = htons(IP_DF);
+		} else if (geneve->df == GENEVE_DF_INHERIT) {
+			struct ethhdr *eth = eth_hdr(skb);
+
+			if (ntohs(eth->h_proto) == ETH_P_IPV6) {
+				df = htons(IP_DF);
+			} else if (ntohs(eth->h_proto) == ETH_P_IP) {
+				struct iphdr *iph = ip_hdr(skb);
+
+				if (iph->frag_off & htons(IP_DF))
+					df = htons(IP_DF);
+			}
+		}
+	}
 
 	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
 	if (unlikely(err))
@@ -1145,6 +1163,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_TTL_INHERIT]	= { .type = NLA_U8 },
+	[IFLA_GENEVE_DF]		= { .type = NLA_U8 },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1180,6 +1199,16 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_GENEVE_DF]) {
+		enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]);
+
+		if (df < 0 || df > GENEVE_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_GENEVE_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1225,7 +1254,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 			    struct netlink_ext_ack *extack,
 			    const struct ip_tunnel_info *info,
 			    bool metadata, bool ipv6_rx_csum,
-			    bool ttl_inherit)
+			    bool ttl_inherit, enum ifla_geneve_df df)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1275,6 +1304,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	geneve->collect_md = metadata;
 	geneve->use_udp6_rx_checksums = ipv6_rx_csum;
 	geneve->ttl_inherit = ttl_inherit;
+	geneve->df = df;
 
 	err = register_netdevice(dev);
 	if (err)
@@ -1294,7 +1324,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack,
 			  struct ip_tunnel_info *info, bool *metadata,
 			  bool *use_udp6_rx_checksums, bool *ttl_inherit,
-			  bool changelink)
+			  enum ifla_geneve_df *df, bool changelink)
 {
 	int attrtype;
 
@@ -1382,6 +1412,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_GENEVE_TOS])
 		info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
+	if (data[IFLA_GENEVE_DF])
+		*df = nla_get_u8(data[IFLA_GENEVE_DF]);
+
 	if (data[IFLA_GENEVE_LABEL]) {
 		info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
 				  IPV6_FLOWLABEL_MASK;
@@ -1500,6 +1533,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
 {
+	enum ifla_geneve_df df = GENEVE_DF_UNSET;
 	bool use_udp6_rx_checksums = false;
 	struct ip_tunnel_info info;
 	bool ttl_inherit = false;
@@ -1508,12 +1542,12 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 
 	init_tnl_info(&info, GENEVE_UDP_PORT);
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, &ttl_inherit, false);
+			     &use_udp6_rx_checksums, &ttl_inherit, &df, false);
 	if (err)
 		return err;
 
 	err = geneve_configure(net, dev, extack, &info, metadata,
-			       use_udp6_rx_checksums, ttl_inherit);
+			       use_udp6_rx_checksums, ttl_inherit, df);
 	if (err)
 		return err;
 
@@ -1576,6 +1610,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_info info;
 	bool metadata;
 	bool use_udp6_rx_checksums;
+	enum ifla_geneve_df df;
 	bool ttl_inherit;
 	int err;
 
@@ -1591,7 +1626,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
 	ttl_inherit = geneve->ttl_inherit;
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, &ttl_inherit, true);
+			     &use_udp6_rx_checksums, &ttl_inherit, &df, true);
 	if (err)
 		return err;
 
@@ -1624,6 +1659,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_GENEVE_DF */
 		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
 		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
 		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
@@ -1672,6 +1708,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->df))
+		goto nla_put_failure;
+
 	if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
 		goto nla_put_failure;
 
@@ -1723,7 +1762,8 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	init_tnl_info(&info, dst_port);
-	err = geneve_configure(net, dev, NULL, &info, true, true, false);
+	err = geneve_configure(net, dev, NULL, &info,
+			       true, true, false, GENEVE_DF_UNSET);
 	if (err) {
 		free_netdev(dev);
 		return ERR_PTR(err);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index efc588949431..f42c069d81db 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -566,10 +566,19 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
 	IFLA_GENEVE_TTL_INHERIT,
+	IFLA_GENEVE_DF,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
 
+enum ifla_geneve_df {
+	GENEVE_DF_UNSET = 0,
+	GENEVE_DF_SET,
+	GENEVE_DF_INHERIT,
+	__GENEVE_DF_END,
+	GENEVE_DF_MAX = __GENEVE_DF_END - 1,
+};
+
 /* PPP section */
 enum {
 	IFLA_PPP_UNSPEC,
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 07/11] selftests: pmtu: Introduce tests for IPv4/IPv6 over GENEVE over IPv6
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Use a router between endpoints, implemented via namespaces, set a low MTU
between router and destination endpoint, exceed it and check PMTU value in
route exceptions.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
This only introduces tests over GENEVE over IPv6 right now. I'll introduce
tests over IPv4 (they can be added trivially) once DF configuration support
is accepted into iproute2.

 tools/testing/selftests/net/pmtu.sh | 78 ++++++++++++++++++++---------
 1 file changed, 55 insertions(+), 23 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 19ede74af560..e9bb0c37bdfc 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -37,6 +37,12 @@
 # - pmtu_ipv6_vxlan6_exception
 #	Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B
 #
+# - pmtu_ipv4_geneve6_exception
+#	Same as pmtu_ipv4_vxlan6, but using a GENEVE tunnel instead of VxLAN
+#
+# - pmtu_ipv6_geneve6_exception
+#	Same as pmtu_ipv6_vxlan6, but using a GENEVE tunnel instead of VxLAN
+#
 # - pmtu_vti4_exception
 #	Set up vti tunnel on top of veth, with xfrm states and policies, in two
 #	namespaces with matching endpoints. Check that route exception is not
@@ -85,6 +91,8 @@ tests="
 	pmtu_ipv6_exception		ipv6: PMTU exceptions
 	pmtu_ipv4_vxlan6_exception	IPv4 over vxlan6: PMTU exceptions
 	pmtu_ipv6_vxlan6_exception	IPv6 over vxlan6: PMTU exceptions
+	pmtu_ipv4_geneve6_exception	IPv4 over geneve6: PMTU exceptions
+	pmtu_ipv6_geneve6_exception	IPv6 over geneve6: PMTU exceptions
 	pmtu_vti6_exception		vti6: PMTU exceptions
 	pmtu_vti4_exception		vti4: PMTU exceptions
 	pmtu_vti4_default_mtu		vti4: default MTU assignment
@@ -222,27 +230,42 @@ setup_vti6() {
 	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask}
 }
 
-setup_vxlan() {
-	a_addr="${1}"
-	b_addr="${2}"
+setup_vxlan_or_geneve() {
+	type="${1}"
+	a_addr="${2}"
+	b_addr="${3}"
+
+	if [ "${type}" = "vxlan" ]; then
+		opts="ttl 64 dstport 4789"
+		opts_a="local ${a_addr}"
+		opts_b="local ${b_addr}"
+	else
+		opts=""
+		opts_a=""
+		opts_b=""
+	fi
 
-	${ns_a} ip link add vxlan_a type vxlan id 1 local ${a_addr} remote ${b_addr} ttl 64 dstport 4789 || return 1
-	${ns_b} ip link add vxlan_b type vxlan id 1 local ${b_addr} remote ${a_addr} ttl 64 dstport 4789
+	${ns_a} ip link add ${type}_a type ${type} id 1 ${opts_a} remote ${b_addr} ${opts} || return 1
+	${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts}
 
-	${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask}   dev vxlan_a
-	${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask}   dev vxlan_b
+	${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a
+	${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
 
-	${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask}   dev vxlan_a
-	${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask}   dev vxlan_b
+	${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a
+	${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b
 
-	${ns_a} ip link set vxlan_a up
-	${ns_b} ip link set vxlan_b up
+	${ns_a} ip link set ${type}_a up
+	${ns_b} ip link set ${type}_b up
 
 	sleep 1
 }
 
+setup_geneve6() {
+	setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+}
+
 setup_vxlan6() {
-	setup_vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+	setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
 }
 
 setup_xfrm() {
@@ -501,15 +524,16 @@ test_pmtu_ipv6_exception() {
 	test_pmtu_ipvX 6
 }
 
-test_pmtu_ipvX_over_vxlan6_exception() {
-	family=${1}
+test_pmtu_ipvX_over_vxlan6_or_geneve6_exception() {
+	type=${1}
+	family=${2}
 	ll_mtu=4000
 
-	setup namespaces routing vxlan6 || return 2
-	#                      IPv6 header   UDP header   VxLAN header   Ethernet header
-	exp_mtu=$((${ll_mtu} - 40          - 8          - 8            - 14))
+	setup namespaces routing ${type}6 || return 2
+	#                      IPv6 header   UDP header   VxLAN/GENEVE header   Ethernet header
+	exp_mtu=$((${ll_mtu} - 40          - 8          - 8                   - 14))
 
-	trace "${ns_a}" vxlan_a      "${ns_b}"  vxlan_b \
+	trace "${ns_a}" ${type}_a    "${ns_b}"  ${type}_b \
 	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
 	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B
 
@@ -527,21 +551,29 @@ test_pmtu_ipvX_over_vxlan6_exception() {
 	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
 	mtu "${ns_r1}" veth_R1-B ${ll_mtu}
 
-	mtu "${ns_a}" vxlan_a $((${ll_mtu} + 1000))
-	mtu "${ns_b}" vxlan_b $((${ll_mtu} + 1000))
+	mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000))
+	mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
 	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null
 
 	# Check that exception was created
 	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
-	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on VxLAN interface"
+	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${type} interface"
 }
 
 test_pmtu_ipv4_vxlan6_exception() {
-	test_pmtu_ipvX_over_vxlan6_exception 4
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception vxlan 4
 }
 
 test_pmtu_ipv6_vxlan6_exception() {
-	test_pmtu_ipvX_over_vxlan6_exception 6
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception vxlan 6
+}
+
+test_pmtu_ipv4_geneve6_exception() {
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception geneve 4
+}
+
+test_pmtu_ipv6_geneve6_exception() {
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception geneve 6
 }
 
 test_pmtu_vti4_exception() {
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 08/11] net: Convert protocol error handlers from void to int
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

We'll need this to handle ICMP errors for tunnels without a sending socket
(i.e. FoU and GUE). There, we might have to look up different types of IP
tunnels, registered as network protocols, before we get a match, so we
want this for the error handlers of IPPROTO_IPIP and IPPROTO_IPV6 in both
inet_protos and inet6_protos. These error codes will be used in the next
patch.

For consistency, return sensible error codes in protocol error handlers
whenever errors don't match a protocol or any of its states.

This has no effect on existing error handling paths.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/net/icmp.h        |  2 +-
 include/net/protocol.h    |  9 ++++++--
 include/net/sctp/sctp.h   |  2 +-
 include/net/tcp.h         |  2 +-
 include/net/udp.h         |  2 +-
 net/dccp/ipv4.c           | 13 +++++++----
 net/dccp/ipv6.c           | 13 +++++++----
 net/ipv4/gre_demux.c      |  9 ++++++--
 net/ipv4/icmp.c           |  6 +++--
 net/ipv4/ip_gre.c         | 48 ++++++++++++++++++++-------------------
 net/ipv4/ipip.c           | 14 ++++++------
 net/ipv4/tcp_ipv4.c       | 22 ++++++++++--------
 net/ipv4/tunnel4.c        | 18 ++++++++++-----
 net/ipv4/udp.c            | 10 ++++----
 net/ipv4/udp_impl.h       |  2 +-
 net/ipv4/udplite.c        |  4 ++--
 net/ipv4/xfrm4_protocol.c | 18 ++++++++++-----
 net/ipv6/icmp.c           |  4 +++-
 net/ipv6/ip6_gre.c        | 18 ++++++++-------
 net/ipv6/tcp_ipv6.c       | 13 +++++++----
 net/ipv6/tunnel6.c        | 12 ++++++----
 net/ipv6/udp.c            | 18 +++++++--------
 net/ipv6/udp_impl.h       |  4 ++--
 net/ipv6/udplite.c        |  5 ++--
 net/ipv6/xfrm6_protocol.c | 18 ++++++++++-----
 net/sctp/input.c          |  5 ++--
 net/sctp/ipv6.c           |  7 ++++--
 27 files changed, 177 insertions(+), 121 deletions(-)

diff --git a/include/net/icmp.h b/include/net/icmp.h
index 3ef2743a8eec..6ac3a5bd0117 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -41,7 +41,7 @@ struct net;
 
 void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info);
 int icmp_rcv(struct sk_buff *skb);
-void icmp_err(struct sk_buff *skb, u32 info);
+int icmp_err(struct sk_buff *skb, u32 info);
 int icmp_init(void);
 void icmp_out_count(struct net *net, unsigned char type);
 
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 4fc75f7ae23b..92b3eaad6088 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -42,7 +42,10 @@ struct net_protocol {
 	int			(*early_demux)(struct sk_buff *skb);
 	int			(*early_demux_handler)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
-	void			(*err_handler)(struct sk_buff *skb, u32 info);
+
+	/* This returns an error if we weren't able to handle the error. */
+	int			(*err_handler)(struct sk_buff *skb, u32 info);
+
 	unsigned int		no_policy:1,
 				netns_ok:1,
 				/* does the protocol do more stringent
@@ -58,10 +61,12 @@ struct inet6_protocol {
 	void    (*early_demux_handler)(struct sk_buff *skb);
 	int	(*handler)(struct sk_buff *skb);
 
-	void	(*err_handler)(struct sk_buff *skb,
+	/* This returns an error if we weren't able to handle the error. */
+	int	(*err_handler)(struct sk_buff *skb,
 			       struct inet6_skb_parm *opt,
 			       u8 type, u8 code, int offset,
 			       __be32 info);
+
 	unsigned int	flags;	/* INET6_PROTO_xxx */
 };
 
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8c2caa370e0f..9a3b48a35e90 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -151,7 +151,7 @@ int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc,
  * sctp/input.c
  */
 int sctp_rcv(struct sk_buff *skb);
-void sctp_v4_err(struct sk_buff *skb, u32 info);
+int sctp_v4_err(struct sk_buff *skb, u32 info);
 void sctp_hash_endpoint(struct sctp_endpoint *);
 void sctp_unhash_endpoint(struct sctp_endpoint *);
 struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a18914d20486..4743836bed2e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -313,7 +313,7 @@ extern struct proto tcp_prot;
 
 void tcp_tasklet_init(void);
 
-void tcp_v4_err(struct sk_buff *skb, u32);
+int tcp_v4_err(struct sk_buff *skb, u32);
 
 void tcp_shutdown(struct sock *sk, int how);
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 9e82cb391dea..7e3f1e2b68eb 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -272,7 +272,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
 				  const struct sock *));
-void udp_err(struct sk_buff *, u32);
+int udp_err(struct sk_buff *, u32);
 int udp_abort(struct sock *sk, int err);
 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
 int udp_push_pending_frames(struct sock *sk);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8e08cea6f178..26a21d97b6b0 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(dccp_req_err);
  * check at all. A more general error queue to queue errors for later handling
  * is probably better.
  */
-static void dccp_v4_err(struct sk_buff *skb, u32 info)
+static int dccp_v4_err(struct sk_buff *skb, u32 info)
 {
 	const struct iphdr *iph = (struct iphdr *)skb->data;
 	const u8 offset = iph->ihl << 2;
@@ -259,16 +259,18 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
 				       inet_iif(skb), 0);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 
 	if (sk->sk_state == DCCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = dccp_hdr_seq(dh);
-	if (sk->sk_state == DCCP_NEW_SYN_RECV)
-		return dccp_req_err(sk, seq);
+	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+		dccp_req_err(sk, seq);
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
@@ -357,6 +359,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6344f1b18a6a..d5740bad5b18 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -68,7 +68,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
 
 }
 
-static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			u8 type, u8 code, int offset, __be32 info)
 {
 	const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -96,16 +96,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!sk) {
 		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 				  ICMP6_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 
 	if (sk->sk_state == DCCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = dccp_hdr_seq(dh);
-	if (sk->sk_state == DCCP_NEW_SYN_RECV)
-		return dccp_req_err(sk, seq);
+	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+		dccp_req_err(sk, seq);
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk))
@@ -183,6 +185,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 7efe740c06eb..a4bf22ee3aed 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -151,20 +151,25 @@ static int gre_rcv(struct sk_buff *skb)
 	return NET_RX_DROP;
 }
 
-static void gre_err(struct sk_buff *skb, u32 info)
+static int gre_err(struct sk_buff *skb, u32 info)
 {
 	const struct gre_protocol *proto;
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
 	u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+	int err = 0;
 
 	if (ver >= GREPROTO_MAX)
-		return;
+		return -EINVAL;
 
 	rcu_read_lock();
 	proto = rcu_dereference(gre_proto[ver]);
 	if (proto && proto->err_handler)
 		proto->err_handler(skb, info);
+	else
+		err = -EPROTONOSUPPORT;
 	rcu_read_unlock();
+
+	return err;
 }
 
 static const struct net_protocol net_gre_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d832beed6e3a..065997f414e6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1079,7 +1079,7 @@ int icmp_rcv(struct sk_buff *skb)
 	goto drop;
 }
 
-void icmp_err(struct sk_buff *skb, u32 info)
+int icmp_err(struct sk_buff *skb, u32 info)
 {
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	int offset = iph->ihl<<2;
@@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info)
 	 */
 	if (icmph->type != ICMP_ECHOREPLY) {
 		ping_err(skb, offset, info);
-		return;
+		return 0;
 	}
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
 	else if (type == ICMP_REDIRECT)
 		ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+	return 0;
 }
 
 /*
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..46eeb9663051 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly;
 static unsigned int gre_tap_net_id __read_mostly;
 static unsigned int erspan_net_id __read_mostly;
 
-static void ipgre_err(struct sk_buff *skb, u32 info,
-		      const struct tnl_ptk_info *tpi)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+		     const struct tnl_ptk_info *tpi)
 {
 
 	/* All the routers (except for Linux) return only
@@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
 	unsigned int data_len = 0;
 	struct ip_tunnel *t;
 
+	if (tpi->proto == htons(ETH_P_TEB))
+		itn = net_generic(net, gre_tap_net_id);
+	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+		 tpi->proto == htons(ETH_P_ERSPAN2))
+		itn = net_generic(net, erspan_net_id);
+	else
+		itn = net_generic(net, ipgre_net_id);
+
+	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+			     iph->daddr, iph->saddr, tpi->key);
+
+	if (!t)
+		return -ENOENT;
+
 	switch (type) {
 	default:
 	case ICMP_PARAMETERPROB:
-		return;
+		return 0;
 
 	case ICMP_DEST_UNREACH:
 		switch (code) {
 		case ICMP_SR_FAILED:
 		case ICMP_PORT_UNREACH:
 			/* Impossible event. */
-			return;
+			return 0;
 		default:
 			/* All others are translated to HOST_UNREACH.
 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
 
 	case ICMP_TIME_EXCEEDED:
 		if (code != ICMP_EXC_TTL)
-			return;
+			return 0;
 		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
 		break;
 
@@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
 		break;
 	}
 
-	if (tpi->proto == htons(ETH_P_TEB))
-		itn = net_generic(net, gre_tap_net_id);
-	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
-		 tpi->proto == htons(ETH_P_ERSPAN2))
-		itn = net_generic(net, erspan_net_id);
-	else
-		itn = net_generic(net, ipgre_net_id);
-
-	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
-	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
-			     iph->daddr, iph->saddr, tpi->key);
-
-	if (!t)
-		return;
-
 #if IS_ENABLED(CONFIG_IPV6)
        if (tpi->proto == htons(ETH_P_IPV6) &&
            !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
 				       type, data_len))
-               return;
+               return 0;
 #endif
 
 	if (t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
-		return;
+		return 0;
 
 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
-		return;
+		return 0;
 
 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 		t->err_count++;
 	else
 		t->err_count = 1;
 	t->err_time = jiffies;
+
+	return 0;
 }
 
 static void gre_err(struct sk_buff *skb, u32 info)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e65287c27e3d..57c5dd283a2c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	struct ip_tunnel *t;
 	int err = 0;
 
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			     iph->daddr, iph->saddr, 0);
+	if (!t) {
+		err = -ENOENT;
+		goto out;
+	}
+
 	switch (type) {
 	case ICMP_DEST_UNREACH:
 		switch (code) {
@@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 		goto out;
 	}
 
-	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
-			     iph->daddr, iph->saddr, 0);
-	if (!t) {
-		err = -ENOENT;
-		goto out;
-	}
-
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 		ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
 		goto out;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index de47038afdf0..a336787d75e5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);
  *
  */
 
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 {
 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
@@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 				       inet_iif(icmp_skb), 0);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 	if (sk->sk_state == TCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = ntohl(th->seq);
-	if (sk->sk_state == TCP_NEW_SYN_RECV)
-		return tcp_req_err(sk, seq,
-				  type == ICMP_PARAMETERPROB ||
-				  type == ICMP_TIME_EXCEEDED ||
-				  (type == ICMP_DEST_UNREACH &&
-				   (code == ICMP_NET_UNREACH ||
-				    code == ICMP_HOST_UNREACH)));
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+				     type == ICMP_TIME_EXCEEDED ||
+				     (type == ICMP_DEST_UNREACH &&
+				      (code == ICMP_NET_UNREACH ||
+				       code == ICMP_HOST_UNREACH)));
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
@@ -613,6 +614,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index c0630013c1ae..33bf8e9c8663 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -149,34 +149,40 @@ static int tunnelmpls4_rcv(struct sk_buff *skb)
 }
 #endif
 
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static void tunnel64_err(struct sk_buff *skb, u32 info)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel64_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 #endif
 
 #if IS_ENABLED(CONFIG_MPLS)
-static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 #endif
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1f054a85062d..b89c4cfd7c62 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -649,7 +649,7 @@ static struct sock *__udp4_lib_err_encap(struct net *net,
  * to find the appropriate port.
  */
 
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 {
 	struct inet_sock *inet;
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -672,7 +672,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 
 		if (!sk) {
 			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-			return;
+			return -ENOENT;
 		}
 		tunnel = true;
 	}
@@ -730,12 +730,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	return;
+	return 0;
 }
 
-void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, &udp_table);
+	return __udp4_lib_err(skb, info, &udp_table);
 }
 
 /*
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e7d18b140287..322672655419 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -7,7 +7,7 @@
 #include <net/inet_common.h>
 
 int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..39c7f17d916f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb)
 	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
-static void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, &udplite_table);
+	return __udp4_lib_err(skb, info, &udplite_table);
 }
 
 static const struct net_protocol udplite_protocol = {
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..35c54865dc42 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm4_protocol *handler;
 
 	for_each_protocol_rcu(esp4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm4_ah_rcv(struct sk_buff *skb)
@@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm4_protocol *handler;
 
 	for_each_protocol_rcu(ah4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
@@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm4_protocol *handler;
 
 	for_each_protocol_rcu(ipcomp4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static const struct net_protocol esp4_protocol = {
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c9c53ade55c3..5d7aa2c2770c 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -84,7 +84,7 @@ static inline struct sock *icmpv6_sk(struct net *net)
 	return net->ipv6.icmp_sk[smp_processor_id()];
 }
 
-static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		       u8 type, u8 code, int offset, __be32 info)
 {
 	/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
@@ -100,6 +100,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!(type & ICMPV6_INFOMSG_MASK))
 		if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
 			ping_err(skb, offset, ntohl(info));
+
+	return 0;
 }
 
 static int icmpv6_rcv(struct sk_buff *skb);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 515adbdba1d2..81b69bcee714 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -423,7 +423,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
 }
 
 
-static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		       u8 type, u8 code, int offset, __be32 info)
 {
 	struct net *net = dev_net(skb->dev);
@@ -433,13 +433,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
 			     offset) < 0)
-		return;
+		return -EINVAL;
 
 	ipv6h = (const struct ipv6hdr *)skb->data;
 	t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
 				 tpi.key, tpi.proto);
 	if (!t)
-		return;
+		return -ENOENT;
 
 	switch (type) {
 		struct ipv6_tlv_tnl_enc_lim *tel;
@@ -449,14 +449,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 				    t->parms.name);
 		if (code != ICMPV6_PORT_UNREACH)
 			break;
-		return;
+		return 0;
 	case ICMPV6_TIME_EXCEED:
 		if (code == ICMPV6_EXC_HOPLIMIT) {
 			net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
 					    t->parms.name);
 			break;
 		}
-		return;
+		return 0;
 	case ICMPV6_PARAMPROB:
 		teli = 0;
 		if (code == ICMPV6_HDR_FIELD)
@@ -472,14 +472,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
 					    t->parms.name);
 		}
-		return;
+		return 0;
 	case ICMPV6_PKT_TOOBIG:
 		ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
-		return;
+		return 0;
 	case NDISC_REDIRECT:
 		ip6_redirect(skb, net, skb->dev->ifindex, 0,
 			     sock_net_uid(net, NULL));
-		return;
+		return 0;
 	}
 
 	if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
@@ -487,6 +487,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	else
 		t->err_count = 1;
 	t->err_time = jiffies;
+
+	return 0;
 }
 
 static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 03e6b7a2bc53..a3f559162521 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -349,7 +349,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk)
 	}
 }
 
-static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		u8 type, u8 code, int offset, __be32 info)
 {
 	const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -371,17 +371,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!sk) {
 		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 				  ICMP6_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 
 	if (sk->sk_state == TCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = ntohl(th->seq);
 	fatal = icmpv6_err_convert(type, code, &err);
-	if (sk->sk_state == TCP_NEW_SYN_RECV)
-		return tcp_req_err(sk, seq, fatal);
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		tcp_req_err(sk, seq, fatal);
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -467,6 +469,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index dae25cad05cd..1991dede7367 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -134,24 +134,28 @@ static int tunnel46_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
-static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			 u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel46_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static const struct inet6_protocol tunnel6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eebd90111646..34641cb5d358 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -512,9 +512,9 @@ static struct sock *__udp6_lib_err_encap(struct net *net,
 	return sk;
 }
 
-void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		    u8 type, u8 code, int offset, __be32 info,
-		    struct udp_table *udptable)
+int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		   u8 type, u8 code, int offset, __be32 info,
+		   struct udp_table *udptable)
 {
 	struct ipv6_pinfo *np;
 	const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -539,7 +539,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!sk) {
 			__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 					  ICMP6_MIB_INERRORS);
-			return;
+			return -ENOENT;
 		}
 		tunnel = true;
 	}
@@ -578,7 +578,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	return;
+	return 0;
 }
 
 static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -609,11 +609,11 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static __inline__ void udpv6_err(struct sk_buff *skb,
-				 struct inet6_skb_parm *opt, u8 type,
-				 u8 code, int offset, __be32 info)
+static __inline__ int udpv6_err(struct sk_buff *skb,
+				struct inet6_skb_parm *opt, u8 type,
+				u8 code, int offset, __be32 info)
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
+	return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
 static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 7903e21c178b..5730e6503cb4 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -9,8 +9,8 @@
 #include <net/transp_v6.h>
 
 int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
-		    __be32, struct udp_table *);
+int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+		   __be32, struct udp_table *);
 
 int udp_v6_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 5000ad6878e6..a125aebc29e5 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -20,11 +20,12 @@ static int udplitev6_rcv(struct sk_buff *skb)
 	return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
-static void udplitev6_err(struct sk_buff *skb,
+static int udplitev6_err(struct sk_buff *skb,
 			  struct inet6_skb_parm *opt,
 			  u8 type, u8 code, int offset, __be32 info)
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
+	return __udp6_lib_err(skb, opt, type, code, offset, info,
+			      &udplite_table);
 }
 
 static const struct inet6_protocol udplitev6_protocol = {
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index b2dc8ce49378..cc979b702c89 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -80,14 +80,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			  u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_protocol *handler;
 
 	for_each_protocol_rcu(esp6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm6_ah_rcv(struct sk_buff *skb)
@@ -107,14 +109,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			 u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_protocol *handler;
 
 	for_each_protocol_rcu(ah6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
@@ -134,14 +138,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			     u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_protocol *handler;
 
 	for_each_protocol_rcu(ipcomp6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static const struct inet6_protocol esp6_protocol = {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5c36a99882ed..7ab08a5b36dc 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -574,7 +574,7 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t)
  * is probably better.
  *
  */
-void sctp_v4_err(struct sk_buff *skb, __u32 info)
+int sctp_v4_err(struct sk_buff *skb, __u32 info)
 {
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
 	const int ihlen = iph->ihl * 4;
@@ -599,7 +599,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
 	skb->transport_header = savesctp;
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 	/* Warning:  The sock lock is held.  Remember to call
 	 * sctp_err_finish!
@@ -653,6 +653,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
 
 out_unlock:
 	sctp_err_finish(sk, transport);
+	return 0;
 }
 
 /*
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fc6c5e4bffa5..6e27c62646e9 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -138,7 +138,7 @@ static struct notifier_block sctp_inet6addr_notifier = {
 };
 
 /* ICMP error handler. */
-static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			u8 type, u8 code, int offset, __be32 info)
 {
 	struct inet6_dev *idev;
@@ -147,7 +147,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	struct sctp_transport *transport;
 	struct ipv6_pinfo *np;
 	__u16 saveip, savesctp;
-	int err;
+	int err, ret = 0;
 	struct net *net = dev_net(skb->dev);
 
 	idev = in6_dev_get(skb->dev);
@@ -163,6 +163,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	skb->transport_header = savesctp;
 	if (!sk) {
 		__ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS);
+		ret = -ENOENT;
 		goto out;
 	}
 
@@ -202,6 +203,8 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 out:
 	if (likely(idev != NULL))
 		in6_dev_put(idev);
+
+	return ret;
 }
 
 static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 09/11] udp: Support for error handlers of tunnels with arbitrary destination port
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.

Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VxLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.

Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/net/ip6_tunnel.h |  2 +
 include/net/ip_tunnels.h |  1 +
 net/ipv4/udp.c           | 75 +++++++++++++++++++++++++++----------
 net/ipv6/udp.c           | 80 ++++++++++++++++++++++++++++++----------
 4 files changed, 119 insertions(+), 39 deletions(-)

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 236e40ba06bf..7855966b4a19 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -69,6 +69,8 @@ struct ip6_tnl_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
 			    u8 *protocol, struct flowi6 *fl6);
+	int (*err_handler)(struct sk_buff *, struct inet6_skb_parm *opt,
+			   u8 type, u8 code, int offset, __be32 info);
 };
 
 #ifdef CONFIG_INET
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index b0d022ff6ea1..5980659312e5 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -311,6 +311,7 @@ struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
 			    u8 *protocol, struct flowi4 *fl4);
+	int (*err_handler)(struct sk_buff *, u32);
 };
 
 #define MAX_IPTUN_ENCAP_OPS 8
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b89c4cfd7c62..83950b4faced 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -105,6 +105,7 @@
 #include <net/net_namespace.h>
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
+#include <net/ip_tunnels.h>
 #include <net/route.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
@@ -592,30 +593,48 @@ void udp_encap_enable(void)
 }
 EXPORT_SYMBOL(udp_encap_enable);
 
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+	int i;
+
+	for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+		int (*handler)(struct sk_buff *skb, u32 info);
+
+		if (!iptun_encaps[i])
+			continue;
+		handler = rcu_dereference(iptun_encaps[i]->err_handler);
+		if (handler && !handler(skb, info))
+			return 0;
+	}
+
+	return -ENOENT;
+}
+
 /* Try to match ICMP errors to UDP tunnels by looking up a socket without
  * reversing source and destination port: this will match tunnels that force the
- * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
- * tunnel implementation to match the error against a valid association.
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). If this doesn't
+ * match any socket, probe tunnels with arbitrary destination ports (e.g. FoU,
+ * GUE): there, the receiving socket is useless, as the port we've sent packets
+ * to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
  *
- * Return the socket if we have a match.
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
  */
 static struct sock *__udp4_lib_err_encap(struct net *net,
 					 const struct iphdr *iph,
 					 struct udphdr *uh,
 					 struct udp_table *udptable,
-					 struct sk_buff *skb)
+					 struct sk_buff *skb, u32 info)
 {
-	int (*lookup)(struct sock *sk, struct sk_buff *skb);
 	int network_offset, transport_offset;
-	struct udp_sock *up;
 	struct sock *sk;
 
-	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
-			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
-			       udptable, NULL);
-	if (!sk)
-		return NULL;
-
 	network_offset = skb_network_offset(skb);
 	transport_offset = skb_transport_offset(skb);
 
@@ -627,10 +646,20 @@ static struct sock *__udp4_lib_err_encap(struct net *net,
 	/* Transport header needs to point to the UDP header */
 	skb_set_transport_header(skb, iph->ihl << 2);
 
-	up = udp_sk(sk);
-	lookup = READ_ONCE(up->encap_err_lookup);
-	if (!lookup || lookup(sk, skb))
-		sk = NULL;
+	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
+			       udptable, NULL);
+	if (sk) {
+		int (*lookup)(struct sock *sk, struct sk_buff *skb);
+		struct udp_sock *up = udp_sk(sk);
+
+		lookup = READ_ONCE(up->encap_err_lookup);
+		if (!lookup || lookup(sk, skb))
+			sk = NULL;
+	}
+
+	if (!sk)
+		sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
 
 	skb_set_transport_header(skb, transport_offset);
 	skb_set_network_header(skb, network_offset);
@@ -667,13 +696,19 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 			       inet_sdif(skb), udptable, NULL);
 	if (!sk) {
 		/* No socket for error: try tunnels before discarding */
-		if (static_branch_unlikely(&udp_encap_needed_key))
-			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb);
+		sk = ERR_PTR(-ENOENT);
+		if (static_branch_unlikely(&udp_encap_needed_key)) {
+			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
+						  info);
+			if (!sk)
+				return 0;
+		}
 
-		if (!sk) {
+		if (IS_ERR(sk)) {
 			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-			return -ENOENT;
+			return PTR_ERR(sk);
 		}
+
 		tunnel = true;
 	}
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 34641cb5d358..a3beeb6ee85a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -45,6 +45,7 @@
 #include <net/raw.h>
 #include <net/tcp_states.h>
 #include <net/ip6_checksum.h>
+#include <net/ip6_tunnel.h>
 #include <net/xfrm.h>
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
@@ -470,30 +471,53 @@ void udpv6_encap_enable(void)
 }
 EXPORT_SYMBOL(udpv6_encap_enable);
 
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb,
+				      struct inet6_skb_parm *opt,
+				      u8 type, u8 code, int offset, u32 info)
+{
+	int i;
+
+	for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+		int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			       u8 type, u8 code, int offset, u32 info);
+
+		if (!ip6tun_encaps[i])
+			continue;
+		handler = rcu_dereference(ip6tun_encaps[i]->err_handler);
+		if (handler && !handler(skb, opt, type, code, offset, info))
+			return 0;
+	}
+
+	return -ENOENT;
+}
+
 /* Try to match ICMP errors to UDP tunnels by looking up a socket without
  * reversing source and destination port: this will match tunnels that force the
- * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
- * tunnel implementation to match the error against a valid association.
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). If this doesn't
+ * match any socket, probe tunnels with arbitrary destination ports (e.g. FoU,
+ * GUE): there, the receiving socket is useless, as the port we've sent packets
+ * to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
  *
- * Return the socket if we have a match.
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
  */
 static struct sock *__udp6_lib_err_encap(struct net *net,
 					 const struct ipv6hdr *hdr, int offset,
 					 struct udphdr *uh,
 					 struct udp_table *udptable,
-					 struct sk_buff *skb)
+					 struct sk_buff *skb,
+					 struct inet6_skb_parm *opt,
+					 u8 type, u8 code, __be32 info)
 {
-	int (*lookup)(struct sock *sk, struct sk_buff *skb);
 	int network_offset, transport_offset;
-	struct udp_sock *up;
 	struct sock *sk;
 
-	sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
-			       &hdr->saddr, uh->dest,
-			       inet6_iif(skb), 0, udptable, skb);
-	if (!sk)
-		return NULL;
-
 	network_offset = skb_network_offset(skb);
 	transport_offset = skb_transport_offset(skb);
 
@@ -502,13 +526,26 @@ static struct sock *__udp6_lib_err_encap(struct net *net,
 	/* Transport header needs to point to the UDP header */
 	skb_set_transport_header(skb, offset);
 
-	up = udp_sk(sk);
-	lookup = READ_ONCE(up->encap_err_lookup);
-	if (!lookup || lookup(sk, skb))
-		sk = NULL;
+	sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+			       &hdr->saddr, uh->dest,
+			       inet6_iif(skb), 0, udptable, skb);
+	if (sk) {
+		int (*lookup)(struct sock *sk, struct sk_buff *skb);
+		struct udp_sock *up = udp_sk(sk);
+
+		lookup = READ_ONCE(up->encap_err_lookup);
+		if (!lookup || lookup(sk, skb))
+			sk = NULL;
+	}
+
+	if (!sk) {
+		sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
+							offset, info));
+	}
 
 	skb_set_transport_header(skb, transport_offset);
 	skb_set_network_header(skb, network_offset);
+
 	return sk;
 }
 
@@ -531,16 +568,21 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			       inet6_iif(skb), inet6_sdif(skb), udptable, skb);
 	if (!sk) {
 		/* No socket for error: try tunnels before discarding */
+		sk = ERR_PTR(-ENOENT);
 		if (static_branch_unlikely(&udpv6_encap_needed_key)) {
 			sk = __udp6_lib_err_encap(net, hdr, offset, uh,
-						  udptable, skb);
+						  udptable, skb,
+						  opt, type, code, info);
+			if (!sk)
+				return 0;
 		}
 
-		if (!sk) {
+		if (IS_ERR(sk)) {
 			__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 					  ICMP6_MIB_INERRORS);
-			return -ENOENT;
+			return PTR_ERR(sk);
 		}
+
 		tunnel = true;
 	}
 
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 10/11] fou, fou6: ICMP error handlers for FoU and GUE
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

As the destination port in FoU and GUE receiving sockets doesn't
necessarily match the remote destination port, we can't associate errors
to the encapsulating tunnels with a socket lookup -- we need to blindly
try them instead. This means we don't even know if we are handling errors
for FoU or GUE without digging into the packets.

Hence, implement a single handler for both, one for IPv4 and one for IPv6,
that will check whether the packet that generated the ICMP error used a
direct IP encapsulation or if it had a GUE header, and send the error to
the matching protocol handler, if any.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 net/ipv4/fou.c      | 68 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/protocol.c |  1 +
 net/ipv6/fou6.c     | 74 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 500a59906b87..0d0ad19ecb87 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -3,6 +3,7 @@
 #include <linux/socket.h>
 #include <linux/skbuff.h>
 #include <linux/ip.h>
+#include <linux/icmp.h>
 #include <linux/udp.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -1003,15 +1004,82 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	return 0;
 }
 
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+	const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+	if (ipprot && ipprot->err_handler) {
+		if (!ipprot->err_handler(skb, info))
+			return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+	int transport_offset = skb_transport_offset(skb);
+	struct guehdr *guehdr;
+	size_t optlen;
+	int ret;
+
+	if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+		return -EINVAL;
+
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+	switch (guehdr->version) {
+	case 0: /* Full GUE header present */
+		break;
+	case 1: {
+		/* Direct encasulation of IPv4 or IPv6 */
+		skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+			goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+		case 6:
+			ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+			goto out;
+#endif
+		default:
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+	}
+	default: /* Undefined version */
+		return -EOPNOTSUPP;
+	}
+
+	if (guehdr->control)
+		return -ENOENT;
+
+	optlen = guehdr->hlen << 2;
+
+	if (validate_gue_flags(guehdr, optlen))
+		return -EINVAL;
+
+	skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+	ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+	skb_set_transport_header(skb, transport_offset);
+	return ret;
+}
+
 
 static const struct ip_tunnel_encap_ops fou_iptun_ops = {
 	.encap_hlen = fou_encap_hlen,
 	.build_header = fou_build_header,
+	.err_handler = gue_err,
 };
 
 static const struct ip_tunnel_encap_ops gue_iptun_ops = {
 	.encap_hlen = gue_encap_hlen,
 	.build_header = gue_build_header,
+	.err_handler = gue_err,
 };
 
 static int ip_tunnel_encap_add_fou_ops(void)
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 32a691b7ce2c..92d249e053be 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,6 +29,7 @@
 #include <net/protocol.h>
 
 struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_protos);
 const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
 EXPORT_SYMBOL(inet_offloads);
 
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
index 6de3c04b0f30..bd675c61deb1 100644
--- a/net/ipv6/fou6.c
+++ b/net/ipv6/fou6.c
@@ -4,6 +4,7 @@
 #include <linux/skbuff.h>
 #include <linux/ip.h>
 #include <linux/udp.h>
+#include <linux/icmpv6.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <net/fou.h>
@@ -69,14 +70,87 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	return 0;
 }
 
+static int gue6_err_proto_handler(int proto, struct sk_buff *skb,
+				  struct inet6_skb_parm *opt,
+				  u8 type, u8 code, int offset, u32 info)
+{
+	const struct inet6_protocol *ipprot;
+
+	ipprot = rcu_dereference(inet6_protos[proto]);
+	if (ipprot && ipprot->err_handler) {
+		if (!ipprot->err_handler(skb, opt, type, code, offset, info))
+			return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		    u8 type, u8 code, int offset, __be32 info)
+{
+	int transport_offset = skb_transport_offset(skb);
+	struct guehdr *guehdr;
+	size_t optlen;
+	int ret;
+
+	if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+		return -EINVAL;
+
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+	switch (guehdr->version) {
+	case 0: /* Full GUE header present */
+		break;
+	case 1: {
+		/* Direct encasulation of IPv4 or IPv6 */
+		skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt,
+						     type, code, offset, info);
+			goto out;
+		case 6:
+			ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt,
+						     type, code, offset, info);
+			goto out;
+		default:
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+	}
+	default: /* Undefined version */
+		return -EOPNOTSUPP;
+	}
+
+	if (guehdr->control)
+		return -ENOENT;
+
+	optlen = guehdr->hlen << 2;
+
+	if (validate_gue_flags(guehdr, optlen))
+		return -EINVAL;
+
+	skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+	ret = gue6_err_proto_handler(guehdr->proto_ctype, skb,
+				     opt, type, code, offset, info);
+
+out:
+	skb_set_transport_header(skb, transport_offset);
+	return ret;
+}
+
+
 static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
 	.encap_hlen = fou_encap_hlen,
 	.build_header = fou6_build_header,
+	.err_handler = gue6_err,
 };
 
 static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
 	.encap_hlen = gue_encap_hlen,
 	.build_header = gue6_build_header,
+	.err_handler = gue6_err,
 };
 
 static int ip6_tnl_encap_add_fou_ops(void)
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 11/11] selftests: pmtu: Introduce FoU and GUE PMTU exceptions tests
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Introduce eight tests, for FoU and GUE, with IPv4 and IPv6 payload,
on IPv4 and IPv6 transport, that check that PMTU exceptions are created
with the right value when exceeding the MTU on a link of the path.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tools/testing/selftests/net/pmtu.sh | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index e9bb0c37bdfc..b746bcb29e0f 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -43,6 +43,14 @@
 # - pmtu_ipv6_geneve6_exception
 #	Same as pmtu_ipv6_vxlan6, but using a GENEVE tunnel instead of VxLAN
 #
+# - pmtu_ipv{4,6}_fou{4,6}_exception
+#	Same as pmtu_ipv4_vxlan6, but using a direct IPv4/IPv6 encapsulation
+#	(FoU) over IPv4/IPv6, instead of VxLAN
+#
+# - pmtu_ipv{4,6}_fou{4,6}_exception
+#	Same as pmtu_ipv4_vxlan6, but using a generic UDP IPv4/IPv6
+#	encapsulation (GUE) over IPv4/IPv6, instead of VxLAN
+#
 # - pmtu_vti4_exception
 #	Set up vti tunnel on top of veth, with xfrm states and policies, in two
 #	namespaces with matching endpoints. Check that route exception is not
@@ -93,6 +101,14 @@ tests="
 	pmtu_ipv6_vxlan6_exception	IPv6 over vxlan6: PMTU exceptions
 	pmtu_ipv4_geneve6_exception	IPv4 over geneve6: PMTU exceptions
 	pmtu_ipv6_geneve6_exception	IPv6 over geneve6: PMTU exceptions
+	pmtu_ipv4_fou4_exception	IPv4 over fou4: PMTU exceptions
+	pmtu_ipv6_fou4_exception	IPv6 over fou4: PMTU exceptions
+	pmtu_ipv4_fou6_exception	IPv4 over fou6: PMTU exceptions
+	pmtu_ipv6_fou6_exception	IPv6 over fou6: PMTU exceptions
+	pmtu_ipv4_gue4_exception	IPv4 over gue4: PMTU exceptions
+	pmtu_ipv6_gue4_exception	IPv6 over gue4: PMTU exceptions
+	pmtu_ipv4_gue6_exception	IPv4 over gue6: PMTU exceptions
+	pmtu_ipv6_gue6_exception	IPv6 over gue6: PMTU exceptions
 	pmtu_vti6_exception		vti6: PMTU exceptions
 	pmtu_vti4_exception		vti4: PMTU exceptions
 	pmtu_vti4_default_mtu		vti4: default MTU assignment
@@ -180,6 +196,89 @@ nsname() {
 	eval echo \$NS_$1
 }
 
+setup_fou_or_gue() {
+	outer="${1}"
+	inner="${2}"
+	encap="${3}"
+
+	if [ "${outer}" = "4" ]; then
+		modprobe fou || return 2
+		a_addr="${prefix4}.${a_r1}.1"
+		b_addr="${prefix4}.${b_r1}.1"
+		if [ "${inner}" = "4" ]; then
+			type="ipip"
+			ipproto="4"
+		else
+			type="sit"
+			ipproto="41"
+		fi
+	else
+		modprobe fou6 || return 2
+		a_addr="${prefix6}:${a_r1}::1"
+		b_addr="${prefix6}:${b_r1}::1"
+		if [ "${inner}" = "4" ]; then
+			type="ip6tnl"
+			mode="mode ipip6"
+			ipproto="4 -6"
+		else
+			type="ip6tnl"
+			mode="mode ip6ip6"
+			ipproto="41 -6"
+		fi
+	fi
+
+	${ns_a} ip fou add port 5555 ipproto ${ipproto} || return 2
+	${ns_a} ip link add ${encap}_a type ${type} ${mode} local ${a_addr} remote ${b_addr} encap ${encap} encap-sport auto encap-dport 5556 || return 2
+
+	${ns_b} ip fou add port 5556 ipproto ${ipproto}
+	${ns_b} ip link add ${encap}_b type ${type} ${mode} local ${b_addr} remote ${a_addr} encap ${encap} encap-sport auto encap-dport 5555
+
+	if [ "${inner}" = "4" ]; then
+		${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${encap}_a
+		${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${encap}_b
+	else
+		${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${encap}_a
+		${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${encap}_b
+	fi
+
+	${ns_a} ip link set ${encap}_a up
+	${ns_b} ip link set ${encap}_b up
+
+	sleep 1
+}
+
+setup_fou44() {
+	setup_fou_or_gue 4 4 fou
+}
+
+setup_fou46() {
+	setup_fou_or_gue 4 6 fou
+}
+
+setup_fou64() {
+	setup_fou_or_gue 6 4 fou
+}
+
+setup_fou66() {
+	setup_fou_or_gue 6 6 fou
+}
+
+setup_gue44() {
+	setup_fou_or_gue 4 4 gue
+}
+
+setup_gue46() {
+	setup_fou_or_gue 4 6 gue
+}
+
+setup_gue64() {
+	setup_fou_or_gue 6 4 gue
+}
+
+setup_gue66() {
+	setup_fou_or_gue 6 6 gue
+}
+
 setup_namespaces() {
 	for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
 		ip netns add ${n} || return 1
@@ -560,6 +659,86 @@ test_pmtu_ipvX_over_vxlan6_or_geneve6_exception() {
 	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${type} interface"
 }
 
+test_pmtu_ipvX_over_fouY_or_gueY() {
+	inner_family=${1}
+	outer_family=${2}
+	encap=${3}
+	ll_mtu=4000
+
+	setup namespaces routing ${encap}${outer_family}${inner_family} || return 2
+	trace "${ns_a}" ${encap}_a   "${ns_b}"  ${encap}_b \
+	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B
+
+	if [ ${inner_family} -eq 4 ]; then
+		ping=ping
+		dst=${tunnel4_b_addr}
+	else
+		ping=${ping6}
+		dst=${tunnel6_b_addr}
+	fi
+
+	if [ "${encap}" = "gue" ]; then
+		encap_overhead=4
+	else
+		encap_overhead=0
+	fi
+
+	if [ ${outer_family} -eq 4 ]; then
+		#                      IPv4 header   UDP header
+		exp_mtu=$((${ll_mtu} - 20          - 8         - ${encap_overhead}))
+	else
+		#                      IPv6 header   Option 4   UDP header
+		exp_mtu=$((${ll_mtu} - 40          - 8        - 8       - ${encap_overhead}))
+	fi
+
+	# Create route exception by exceeding link layer MTU
+	mtu "${ns_a}"  veth_A-R1 $((${ll_mtu} + 1000))
+	mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
+	mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+	mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000))
+	mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000))
+	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null
+
+	# Check that exception was created
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${encap} interface"
+}
+
+test_pmtu_ipv4_fou4_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 4 4 fou
+}
+
+test_pmtu_ipv6_fou4_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 6 4 fou
+}
+
+test_pmtu_ipv4_fou6_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 4 6 fou
+}
+
+test_pmtu_ipv6_fou6_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 6 6 fou
+}
+
+test_pmtu_ipv4_gue4_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 4 4 gue
+}
+
+test_pmtu_ipv6_gue4_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 6 4 gue
+}
+
+test_pmtu_ipv4_gue6_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 4 6 gue
+}
+
+test_pmtu_ipv6_gue6_exception() {
+	test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue
+}
+
 test_pmtu_ipv4_vxlan6_exception() {
 	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception vxlan 4
 }
-- 
2.19.1

^ permalink raw reply related

* [PATCH iproute2 net-next 0/2] Add DF configuration for VxLAN and GENEVE link types
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev

This series adds configuration of the DF bit in outgoing IPv4 packets for
VxLAN and GENEVE link types.

I also included uapi/linux/if_link.h changes for convenience, please let
me know if I should repost without them.

Stefano Brivio (2):
  iplink_vxlan: Add DF configuration
  iplink_geneve: Add DF configuration

 include/uapi/linux/if_link.h | 18 ++++++++++++++++++
 ip/iplink_geneve.c           | 29 +++++++++++++++++++++++++++++
 ip/iplink_vxlan.c            | 29 +++++++++++++++++++++++++++++
 man/man8/ip-link.8.in        | 28 ++++++++++++++++++++++++++++
 4 files changed, 104 insertions(+)

-- 
2.19.1

^ permalink raw reply

* [PATCH iproute2 net-next 1/2] iplink_vxlan: Add DF configuration
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev
In-Reply-To: <cover.1541535725.git.sbrivio@redhat.com>

Allow to set the DF bit behaviour for outgoing IPv4 packets: it can be
always on, inherited from the inner header, or, by default, always off,
which is the current behaviour.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/uapi/linux/if_link.h |  9 +++++++++
 ip/iplink_vxlan.c            | 29 +++++++++++++++++++++++++++++
 man/man8/ip-link.8.in        | 14 ++++++++++++++
 3 files changed, 52 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9c254603ebda..4caf683ce546 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -530,6 +530,7 @@ enum {
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
+	IFLA_VXLAN_DF,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
@@ -539,6 +540,14 @@ struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+enum ifla_vxlan_df {
+	VXLAN_DF_UNSET = 0,
+	VXLAN_DF_SET,
+	VXLAN_DF_INHERIT,
+	__VXLAN_DF_END,
+	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
+};
+
 /* GENEVE section */
 enum {
 	IFLA_GENEVE_UNSPEC,
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 7fc0e2b4eb06..86afbe1334f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -31,6 +31,7 @@ static void print_explain(FILE *f)
 		"                 [ local ADDR ]\n"
 		"                 [ ttl TTL ]\n"
 		"                 [ tos TOS ]\n"
+		"                 [ df DF ]\n"
 		"                 [ flowlabel LABEL ]\n"
 		"                 [ dev PHYS_DEV ]\n"
 		"                 [ dstport PORT ]\n"
@@ -52,6 +53,7 @@ static void print_explain(FILE *f)
 		"       ADDR  := { IP_ADDRESS | any }\n"
 		"       TOS   := { NUMBER | inherit }\n"
 		"       TTL   := { 1..255 | auto | inherit }\n"
+		"       DF    := { unset | set | inherit }\n"
 		"       LABEL := 0-1048575\n"
 	);
 }
@@ -170,6 +172,22 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv,
 			} else
 				tos = 1;
 			addattr8(n, 1024, IFLA_VXLAN_TOS, tos);
+		} else if (!matches(*argv, "df")) {
+			enum ifla_vxlan_df df;
+
+			NEXT_ARG();
+			check_duparg(&attrs, IFLA_VXLAN_DF, "df", *argv);
+			if (strcmp(*argv, "unset") == 0)
+				df = VXLAN_DF_UNSET;
+			else if (strcmp(*argv, "set") == 0)
+				df = VXLAN_DF_SET;
+			else if (strcmp(*argv, "inherit") == 0)
+				df = VXLAN_DF_INHERIT;
+			else
+				invarg("DF must be 'unset', 'set' or 'inherit'",
+				       *argv);
+
+			addattr8(n, 1024, IFLA_VXLAN_DF, df);
 		} else if (!matches(*argv, "label") ||
 			   !matches(*argv, "flowlabel")) {
 			__u32 uval;
@@ -538,6 +556,17 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 			print_string(PRINT_FP, NULL, "ttl %s ", "auto");
 	}
 
+	if (tb[IFLA_VXLAN_DF]) {
+		enum ifla_vxlan_df df = rta_getattr_u8(tb[IFLA_VXLAN_DF]);
+
+		if (df == VXLAN_DF_UNSET)
+			print_string(PRINT_JSON, "df", "df %s ", "unset");
+		else if (df == VXLAN_DF_SET)
+			print_string(PRINT_ANY, "df", "df %s ", "set");
+		else if (df == VXLAN_DF_INHERIT)
+			print_string(PRINT_ANY, "df", "df %s ", "inherit");
+	}
+
 	if (tb[IFLA_VXLAN_LABEL]) {
 		__u32 label = rta_getattr_u32(tb[IFLA_VXLAN_LABEL]);
 
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index 5132f514b279..1b899dd06b92 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -496,6 +496,8 @@ the following additional arguments are supported:
 ] [
 .BI tos " TOS "
 ] [
+.BI df " DF "
+] [
 .BI flowlabel " FLOWLABEL "
 ] [
 .BI dstport " PORT "
@@ -565,6 +567,18 @@ parameter.
 .BI tos " TOS"
 - specifies the TOS value to use in outgoing packets.
 
+.sp
+.BI df " DF"
+- specifies the usage of the DF bit in outgoing packets with IPv4 headers.
+The value
+.B inherit
+causes the bit to be copied from the original IP header. The values
+.B unset
+and
+.B set
+cause the bit to be always unset or always set, respectively. By default, the
+bit is not set.
+
 .sp
 .BI flowlabel " FLOWLABEL"
 - specifies the flow label to use in outgoing packets.
-- 
2.19.1

^ permalink raw reply related

* [PATCH iproute2 net-next 2/2] iplink_geneve: Add DF configuration
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev
In-Reply-To: <cover.1541535725.git.sbrivio@redhat.com>

Allow to set the DF bit behaviour for outgoing IPv4 packets: it can be
always on, inherited from the inner header, or, by default, always off,
which is the current behaviour.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/uapi/linux/if_link.h |  9 +++++++++
 ip/iplink_geneve.c           | 29 +++++++++++++++++++++++++++++
 man/man8/ip-link.8.in        | 14 ++++++++++++++
 3 files changed, 52 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 4caf683ce546..183ca7527178 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -563,10 +563,19 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
 	IFLA_GENEVE_TTL_INHERIT,
+	IFLA_GENEVE_DF,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
 
+enum ifla_geneve_df {
+	GENEVE_DF_UNSET = 0,
+	GENEVE_DF_SET,
+	GENEVE_DF_INHERIT,
+	__GENEVE_DF_END,
+	GENEVE_DF_MAX = __GENEVE_DF_END - 1,
+};
+
 /* PPP section */
 enum {
 	IFLA_PPP_UNSPEC,
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index c417842b2a5b..1872b74c5d70 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -24,6 +24,7 @@ static void print_explain(FILE *f)
 		"                  remote ADDR\n"
 		"                  [ ttl TTL ]\n"
 		"                  [ tos TOS ]\n"
+		"                  [ df DF ]\n"
 		"                  [ flowlabel LABEL ]\n"
 		"                  [ dstport PORT ]\n"
 		"                  [ [no]external ]\n"
@@ -35,6 +36,7 @@ static void print_explain(FILE *f)
 		"       ADDR  := IP_ADDRESS\n"
 		"       TOS   := { NUMBER | inherit }\n"
 		"       TTL   := { 1..255 | auto | inherit }\n"
+		"       DF    := { unset | set | inherit }\n"
 		"       LABEL := 0-1048575\n"
 	);
 }
@@ -115,6 +117,22 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv,
 				tos = uval;
 			} else
 				tos = 1;
+		} else if (!matches(*argv, "df")) {
+			enum ifla_geneve_df df;
+
+			NEXT_ARG();
+			check_duparg(&attrs, IFLA_GENEVE_DF, "df", *argv);
+			if (strcmp(*argv, "unset") == 0)
+				df = GENEVE_DF_UNSET;
+			else if (strcmp(*argv, "set") == 0)
+				df = GENEVE_DF_SET;
+			else if (strcmp(*argv, "inherit") == 0)
+				df = GENEVE_DF_INHERIT;
+			else
+				invarg("DF must be 'unset', 'set' or 'inherit'",
+				       *argv);
+
+			addattr8(n, 1024, IFLA_GENEVE_DF, df);
 		} else if (!matches(*argv, "label") ||
 			   !matches(*argv, "flowlabel")) {
 			__u32 uval;
@@ -287,6 +305,17 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 			print_string(PRINT_FP, NULL, "tos %s ", "inherit");
 	}
 
+	if (tb[IFLA_GENEVE_DF]) {
+		enum ifla_geneve_df df = rta_getattr_u8(tb[IFLA_GENEVE_DF]);
+
+		if (df == GENEVE_DF_UNSET)
+			print_string(PRINT_JSON, "df", "df %s ", "unset");
+		else if (df == GENEVE_DF_SET)
+			print_string(PRINT_ANY, "df", "df %s ", "set");
+		else if (df == GENEVE_DF_INHERIT)
+			print_string(PRINT_ANY, "df", "df %s ", "inherit");
+	}
+
 	if (tb[IFLA_GENEVE_LABEL]) {
 		__u32 label = rta_getattr_u32(tb[IFLA_GENEVE_LABEL]);
 
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index 1b899dd06b92..568e0aa02579 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -1180,6 +1180,8 @@ the following additional arguments are supported:
 ] [
 .BI tos " TOS "
 ] [
+.BI df " DF "
+] [
 .BI flowlabel " FLOWLABEL "
 ] [
 .BI dstport " PORT"
@@ -1212,6 +1214,18 @@ ttl. Default option is "0".
 .BI tos " TOS"
 - specifies the TOS value to use in outgoing packets.
 
+.sp
+.BI df " DF"
+- specifies the usage of the DF bit in outgoing packets with IPv4 headers.
+The value
+.B inherit
+causes the bit to be copied from the original IP header. The values
+.B unset
+and
+.B set
+cause the bit to be always unset or always set, respectively. By default, the
+bit is not set.
+
 .sp
 .BI flowlabel " FLOWLABEL"
 - specifies the flow label to use in outgoing packets.
-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH bpf-next v2 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
From: Alexei Starovoitov @ 2018-11-06 21:56 UTC (permalink / raw)
  To: Edward Cree
  Cc: Martin Lau, Yonghong Song, Alexei Starovoitov,
	daniel@iogearbox.net, netdev@vger.kernel.org, Kernel Team
In-Reply-To: <84383282-9439-c523-1f1d-9c4445ba1c69@solarflare.com>

On Tue, Nov 06, 2018 at 06:52:11PM +0000, Edward Cree wrote:
> On 06/11/18 06:29, Alexei Starovoitov wrote:
> > BTF is not pure type information. BTF is everything that verifier needs
> > to know to make safety decisions that bpf instruction set doesn't have.
> Yes, I'm not disputing that and never have.
> I'm just saying that it will be much cleaner and better if it's
>  internally organised differently.
> 
> > Splitting pure types into one section, variables into another,
> > functions into yet another is not practical, since the same
> > modifiers (like const or volatile) need to be applied to
> > variables and functions. At the end all sections will have
> > the same style of encoding, hence no need to duplicate
> > the encoding three times and instead it's cleaner to encode
> > all of them BTF-style via different KINDs.
> This shows that you've misunderstood what I'm proposing, probably
>  I explained it poorly so I'll try again.
> I'm not suggesting that the 'functions' and 'variables' sections
>  would have _type_ records in them, only that they would reference
>  records in the 'types' section.  So if for instance we have
>     int foo(int x) { int quux; /* ... */ }
>     int bar(int y) { /* ... */ }
>  in the source, then in the 'types' section we would have
> 1 INT 32bits encoding=signed offset=0
> 2 FUNC args=(name="x" type=1,), ret=1
> 3 FUNC args=(name="y" type=1,), ret=1
>  while in the 'functions' section we would have
> 1 "foo" type=2 start_insn_idx=23 insn_count=19 (... maybe other info too...)
> 2 "bar" type=3 start_insn_idx=42 insn_count=5

that looks very weird to me. Why split func name from argument names?
The 'function name' as seen by the BTF may not be the symbol name
as seen in elf file.
Like C++ mangles names for elf. If/when we have C++ front-end for BPF
the BTF func name will be 'class::method' string.
While elf symbol name will still be mangled string in the elf file.

btw libbpf processes that elf symbol name for bpf prog already
and passes it to the kernel as bpf_attr.prog_name.
BTF's func name should be the one seen in the source.
Whatever that source code might be.
There are C, bpftrace, p4 and python frontends. These languages
should be free to put into BTF KIND_FUNC name that makes sense
from the language point of view.
Ideally it's C-like name, so when bpftool prints that BTF
it would be meaningful.

btw we've been thinking to teach libbpf to generate BTF KIND_FUNC
on the fly based on elf symbol name (when real BTF is missing in the elf),
but decided not to go that route for now.

>  and in the 'variables' section we might have
> 1 "quux" type=1 where=stack func=1 offset=-8

that doesn't work. stack slots can be reused by compiler.
variable can be in the register too.
right now we're not planning to have an equivalent of such dwarf
debug info in BTF, since -O2 is mandatory and with optimizations
variable tracking (gcc term) is not effective.
Instead we will annotate every load/store with btf type id.
Meaning no plans to tackle local variables.

The global variables for given .c file will look like single KIND_STRUCT
(which is variable length)
and when libbpf will learn to 'link' multiple .o the BTF deduplication
work (which we're doing for vmlinux) will apply as-is to
combine multiple .o together.

> 
> Thus the graph of types lives entirely in the 'types' section, but
>  things-that-are-not-types don't.  I'm not making a distinction
>  between "pure types" and (somehow) impure types; I'm making a
>  distinction between types (with all their impurities) and
>  *instances* of those types.
> Note that these 'sections' may all really be regions of the '.BTF'
>  ELF section, if that makes the implementation easier.  Also, the
>  'functions' and 'variables' sections _won't_ have the same style
>  of encoding as the 'types', because they're storing entirely
>  different data and in fact don't need variable record sizes.

yes we do see these things differently.
To us function name is the debug info that fits well into BTF description.
Whereas you see the function name part of function declaration
as something 'entirely different'. I don't quite get that point.
In C elf symbol name and in-source func name are the same,
which is probably causing this terminology confusion.

^ permalink raw reply

* Re: KASAN: use-after-free Read in crypto_gcm_init_common
From: Ard Biesheuvel @ 2018-11-07  7:26 UTC (permalink / raw)
  To: syzbot, Boris Pismenny, Aviad Yehezkel, Dave Watson,
	John Fastabend, Daniel Borkmann, <netdev@vger.kernel.org>
  Cc: David S. Miller, Herbert Xu,
	open list:HARDWARE RANDOM NUMBER GENERATOR CORE,
	Linux Kernel Mailing List, syzkaller-bugs
In-Reply-To: <00000000000060e0ae057a092be8@google.com>

(+ TLS maintainers and netdev)

On 7 November 2018 at 02:38, syzbot
<syzbot+e736399a2c4054612307@syzkaller.appspotmail.com> wrote:
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:    4710e78940d8 Merge tag 'nfs-for-4.20-2' of git://git.linux..
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=17a68093400000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=9384ecb1c973baed
> dashboard link: https://syzkaller.appspot.com/bug?extid=e736399a2c4054612307
> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=17902f5b400000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=111377e5400000
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+e736399a2c4054612307@syzkaller.appspotmail.com
>
> startpar (5496) used greatest stack depth: 15744 bytes left
> ==================================================================
> BUG: KASAN: use-after-free in memcpy include/linux/string.h:352 [inline]
> BUG: KASAN: use-after-free in crypto_gcm_init_common+0xe2/0x710
> crypto/gcm.c:181
> Read of size 12 at addr ffff8801d7b91b00 by task kworker/1:2/3205
>
> CPU: 1 PID: 3205 Comm: kworker/1:2 Not tainted 4.19.0+ #320
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Workqueue: pencrypt padata_parallel_worker
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x244/0x39d lib/dump_stack.c:113
>  print_address_description.cold.7+0x9/0x1ff mm/kasan/report.c:256
>  kasan_report_error mm/kasan/report.c:354 [inline]
>  kasan_report.cold.8+0x242/0x309 mm/kasan/report.c:412
>  check_memory_region_inline mm/kasan/kasan.c:260 [inline]
>  check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
>  memcpy+0x23/0x50 mm/kasan/kasan.c:302
>  memcpy include/linux/string.h:352 [inline]
>  crypto_gcm_init_common+0xe2/0x710 crypto/gcm.c:181
>  crypto_gcm_encrypt+0xe2/0x6b0 crypto/gcm.c:479
>  crypto_aead_encrypt include/crypto/aead.h:364 [inline]
>  pcrypt_aead_enc+0xd6/0x340 crypto/pcrypt.c:143
>  padata_parallel_worker+0x49d/0x760 kernel/padata.c:86
>  process_one_work+0xc90/0x1c40 kernel/workqueue.c:2153
>  worker_thread+0x17f/0x1390 kernel/workqueue.c:2296
>  kthread+0x35a/0x440 kernel/kthread.c:246
>  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352
>
> Allocated by task 5668:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
>  kmem_cache_alloc_trace+0x152/0x750 mm/slab.c:3620
>  kmalloc include/linux/slab.h:546 [inline]
>  tls_set_sw_offload+0xcb3/0x1390 net/tls/tls_sw.c:1945
>  do_tls_setsockopt_conf net/tls/tls_main.c:467 [inline]
>  do_tls_setsockopt net/tls/tls_main.c:514 [inline]
>  tls_setsockopt+0x689/0x770 net/tls/tls_main.c:533
>  sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:2978
>  __sys_setsockopt+0x1ba/0x3c0 net/socket.c:1902
>  __do_sys_setsockopt net/socket.c:1913 [inline]
>  __se_sys_setsockopt net/socket.c:1910 [inline]
>  __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1910
>  do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 5668:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521
>  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>  __cache_free mm/slab.c:3498 [inline]
>  kfree+0xcf/0x230 mm/slab.c:3817
>  tls_sk_proto_close+0x5fa/0x750 net/tls/tls_main.c:277
>  inet_release+0x104/0x1f0 net/ipv4/af_inet.c:428
>  inet6_release+0x50/0x70 net/ipv6/af_inet6.c:458
>  __sock_release+0xd7/0x250 net/socket.c:579
>  sock_close+0x19/0x20 net/socket.c:1141
>  __fput+0x385/0xa30 fs/file_table.c:278
>  ____fput+0x15/0x20 fs/file_table.c:309
>  task_work_run+0x1e8/0x2a0 kernel/task_work.c:113
>  tracehook_notify_resume include/linux/tracehook.h:188 [inline]
>  exit_to_usermode_loop+0x318/0x380 arch/x86/entry/common.c:166
>  prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline]
>  syscall_return_slowpath arch/x86/entry/common.c:268 [inline]
>  do_syscall_64+0x6be/0x820 arch/x86/entry/common.c:293
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> The buggy address belongs to the object at ffff8801d7b91b00
>  which belongs to the cache kmalloc-32 of size 32
> The buggy address is located 0 bytes inside of
>  32-byte region [ffff8801d7b91b00, ffff8801d7b91b20)
> The buggy address belongs to the page:
> page:ffffea00075ee440 count:1 mapcount:0 mapping:ffff8801da8001c0
> index:0xffff8801d7b91fc1
> flags: 0x2fffc0000000200(slab)
> raw: 02fffc0000000200 ffffea00075effc8 ffff8801da801248 ffff8801da8001c0
> raw: ffff8801d7b91fc1 ffff8801d7b91000 000000010000003f 0000000000000000
> page dumped because: kasan: bad access detected
>
> Memory state around the buggy address:
>  ffff8801d7b91a00: fb fb fb fb fc fc fc fc 00 00 fc fc fc fc fc fc
>  ffff8801d7b91a80: 00 00 fc fc fc fc fc fc fb fb fb fb fc fc fc fc
>>
>> ffff8801d7b91b00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
>
>                    ^
>  ffff8801d7b91b80: fb fb fb fb fc fc fc fc 00 00 fc fc fc fc fc fc
>  ffff8801d7b91c00: 00 00 00 00 fc fc fc fc fb fb fb fb fc fc fc fc
> ==================================================================
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
> syzbot.
> syzbot can test patches for this bug, for details see:
> https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH rdma] net/mlx5: Fix XRC SRQ umem valid bits
From: Jason Gunthorpe @ 2018-11-06 22:02 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Leon Romanovsky, Yishai Hadas, RDMA mailing list, Artemy Kovalyov,
	Saeed Mahameed, linux-netdev, Leon Romanovsky
In-Reply-To: <cb4d41e3a24c667c5c4e0492d3d3b21a5a165919.camel@redhat.com>

On Tue, Nov 06, 2018 at 04:31:08PM -0500, Doug Ledford wrote:
> On Wed, 2018-10-31 at 12:20 +0200, Leon Romanovsky wrote:
> > From: Yishai Hadas <yishaih@mellanox.com>
> > 
> > Adapt XRC SRQ to the latest HW specification with fixed definition
> > around umem valid bits. The previous definition relied on a bit which
> > was taken for other purposes in legacy FW.
> > 
> > Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
> > Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> > Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> > Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> > Hi Doug, Jason
> > 
> > This commit fixes code sent in this merge window, so I'm not marking it
> > with any rdma-rc/rdma-next. It will be better to be sent during this merge
> > window if you have extra pull request to issue, or as a -rc material, if
> > not.
> > 
> > BTW, we didn't combine reserved fields, because our convention is to align such
> > fields to 32 bits for better readability.
> > 
> > Thanks
> 
> This looks fine.  Let me know when it's in the mlx5-next tree to pull.

It needs to go to -rc... 

This needs a mlx5-rc branch for this I guess?

Jason 

^ permalink raw reply

* Re: [PATCH rdma] net/mlx5: Fix XRC SRQ umem valid bits
From: Doug Ledford @ 2018-11-06 22:10 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Leon Romanovsky, Yishai Hadas, RDMA mailing list, Artemy Kovalyov,
	Saeed Mahameed, linux-netdev, Leon Romanovsky
In-Reply-To: <20181106220206.GC30490@mellanox.com>

[-- Attachment #1: Type: text/plain, Size: 1768 bytes --]

On Tue, 2018-11-06 at 22:02 +0000, Jason Gunthorpe wrote:
> On Tue, Nov 06, 2018 at 04:31:08PM -0500, Doug Ledford wrote:
> > On Wed, 2018-10-31 at 12:20 +0200, Leon Romanovsky wrote:
> > > From: Yishai Hadas <yishaih@mellanox.com>
> > > 
> > > Adapt XRC SRQ to the latest HW specification with fixed definition
> > > around umem valid bits. The previous definition relied on a bit which
> > > was taken for other purposes in legacy FW.
> > > 
> > > Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
> > > Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> > > Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> > > Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> > > Hi Doug, Jason
> > > 
> > > This commit fixes code sent in this merge window, so I'm not marking it
> > > with any rdma-rc/rdma-next. It will be better to be sent during this merge
> > > window if you have extra pull request to issue, or as a -rc material, if
> > > not.
> > > 
> > > BTW, we didn't combine reserved fields, because our convention is to align such
> > > fields to 32 bits for better readability.
> > > 
> > > Thanks
> > 
> > This looks fine.  Let me know when it's in the mlx5-next tree to pull.
> 
> It needs to go to -rc... 
> 
> This needs a mlx5-rc branch for this I guess?

I don't think so.  As long as it's the first commit in mlx5-next, and
mlx5-next is 4.20-rc1 based, then pulling this commit into the -rc tree
will only pull the single commit.  Then when we pull into for-next for
the first time, we will get this in for-next too.  That seems best to
me.

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH rdma] net/mlx5: Fix XRC SRQ umem valid bits
From: Jason Gunthorpe @ 2018-11-06 22:11 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Leon Romanovsky, Yishai Hadas, RDMA mailing list, Artemy Kovalyov,
	Saeed Mahameed, linux-netdev, Leon Romanovsky
In-Reply-To: <581ad7a59e00c7ee01a65940637df25ad826041c.camel@redhat.com>

On Tue, Nov 06, 2018 at 05:10:53PM -0500, Doug Ledford wrote:
> On Tue, 2018-11-06 at 22:02 +0000, Jason Gunthorpe wrote:
> > On Tue, Nov 06, 2018 at 04:31:08PM -0500, Doug Ledford wrote:
> > > On Wed, 2018-10-31 at 12:20 +0200, Leon Romanovsky wrote:
> > > > From: Yishai Hadas <yishaih@mellanox.com>
> > > > 
> > > > Adapt XRC SRQ to the latest HW specification with fixed definition
> > > > around umem valid bits. The previous definition relied on a bit which
> > > > was taken for other purposes in legacy FW.
> > > > 
> > > > Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
> > > > Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> > > > Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> > > > Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> > > > Hi Doug, Jason
> > > > 
> > > > This commit fixes code sent in this merge window, so I'm not marking it
> > > > with any rdma-rc/rdma-next. It will be better to be sent during this merge
> > > > window if you have extra pull request to issue, or as a -rc material, if
> > > > not.
> > > > 
> > > > BTW, we didn't combine reserved fields, because our convention is to align such
> > > > fields to 32 bits for better readability.
> > > > 
> > > > Thanks
> > > 
> > > This looks fine.  Let me know when it's in the mlx5-next tree to pull.
> > 
> > It needs to go to -rc... 
> > 
> > This needs a mlx5-rc branch for this I guess?
> 
> I don't think so.  As long as it's the first commit in mlx5-next, and
> mlx5-next is 4.20-rc1 based, then pulling this commit into the -rc tree
> will only pull the single commit.  Then when we pull into for-next for
> the first time, we will get this in for-next too.  That seems best to
> me.

That works too, if Leon is fast :)

Jason

^ permalink raw reply

* [PATCH net-next] ipv6: gro: do not use slow memcmp() in ipv6_gro_receive()
From: Eric Dumazet @ 2018-11-06 22:25 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

ipv6_gro_receive() compares 34 bytes using slow memcmp(),
while handcoding with a couple of ipv6_addr_equal() is much faster.

Before this patch, "perf top -e cycles:pp -C <cpu>" would
see memcmp() using ~10% of cpu cycles on a 40Gbit NIC
receiving IPv6 TCP traffic.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv6/ip6_offload.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index c7e495f1201105f1ac1724a7b8fd82399efcce32..70f525c33cb6c1f375919b94a7afc45cc6bdcd5f 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -229,14 +229,21 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head,
 		 * XXX skbs on the gro_list have all been parsed and pulled
 		 * already so we don't need to compare nlen
 		 * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
-		 * memcmp() alone below is suffcient, right?
+		 * memcmp() alone below is sufficient, right?
 		 */
 		 if ((first_word & htonl(0xF00FFFFF)) ||
-		    memcmp(&iph->nexthdr, &iph2->nexthdr,
-			   nlen - offsetof(struct ipv6hdr, nexthdr))) {
+		    !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
+		    !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
+		    *(u16 *)&iph->nexthdr != *(u16 *)&iph2->nexthdr) {
+not_same_flow:
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
+		if (unlikely(nlen > sizeof(struct ipv6hdr))) {
+			if (memcmp(iph + 1, iph2 + 1,
+				   nlen - sizeof(struct ipv6hdr)))
+				goto not_same_flow;
+		}
 		/* flush if Traffic Class fields are different */
 		NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
 		NAPI_GRO_CB(p)->flush |= flush;
-- 
2.19.1.930.g4563a0d9d0-goog

^ permalink raw reply related

* [PATCH v4 1/3] net: emac: implement 802.1Q VLAN TX tagging support
From: Christian Lamparter @ 2018-11-06 22:27 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Ivan Mikhaylov, Florian Fainelli

As per' APM82181 Embedded Processor User Manual 26.1 EMAC Features:
VLAN:
 - Support for VLAN tag ID in compliance with IEEE 802.3ac.
 - VLAN tag insertion or replacement for transmit packets

This patch completes the missing code for the VLAN tx tagging
support, as the the EMAC_MR1_VLE was already enabled.

Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c | 32 ++++++++++++++++++++++++----
 drivers/net/ethernet/ibm/emac/core.h |  6 +++++-
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 760b2ad8e295..be560f9031f4 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -37,6 +37,7 @@
 #include <linux/ethtool.h>
 #include <linux/mii.h>
 #include <linux/bitops.h>
+#include <linux/if_vlan.h>
 #include <linux/workqueue.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
@@ -674,7 +675,7 @@ static int emac_configure(struct emac_instance *dev)
 		 ndev->dev_addr[5]);
 
 	/* VLAN Tag Protocol ID */
-	out_be32(&p->vtpid, 0x8100);
+	out_be32(&p->vtpid, ETH_P_8021Q);
 
 	/* Receive mode register */
 	r = emac_iff2rmr(ndev);
@@ -1435,6 +1436,22 @@ static inline netdev_tx_t emac_xmit_finish(struct emac_instance *dev, int len)
 	return NETDEV_TX_OK;
 }
 
+static inline u16 emac_tx_vlan(struct emac_instance *dev, struct sk_buff *skb)
+{
+	/* Handle VLAN TPID and TCI insert if this is a VLAN skb */
+	if (emac_has_feature(dev, EMAC_FTR_HAS_VLAN_CTAG_TX) &&
+	    skb_vlan_tag_present(skb)) {
+		struct emac_regs __iomem *p = dev->emacp;
+
+		/* update the VLAN TCI */
+		out_be32(&p->vtci, (u32)skb_vlan_tag_get(skb));
+
+		/* Insert VLAN tag */
+		return EMAC_TX_CTRL_IVT;
+	}
+	return 0;
+}
+
 /* Tx lock BH */
 static netdev_tx_t emac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
@@ -1443,7 +1460,7 @@ static netdev_tx_t emac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	int slot;
 
 	u16 ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY |
-	    MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb);
+	    MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb) | emac_tx_vlan(dev, skb);
 
 	slot = dev->tx_slot++;
 	if (dev->tx_slot == NUM_TX_BUFF) {
@@ -1518,7 +1535,7 @@ emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev)
 		goto stop_queue;
 
 	ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY |
-	    emac_tx_csum(dev, skb);
+	    emac_tx_csum(dev, skb) | emac_tx_vlan(dev, skb);
 	slot = dev->tx_slot;
 
 	/* skb data */
@@ -2891,7 +2908,8 @@ static int emac_init_config(struct emac_instance *dev)
 		if (of_device_is_compatible(np, "ibm,emac-apm821xx")) {
 			dev->features |= (EMAC_APM821XX_REQ_JUMBO_FRAME_SIZE |
 					  EMAC_FTR_APM821XX_NO_HALF_DUPLEX |
-					  EMAC_FTR_460EX_PHY_CLK_FIX);
+					  EMAC_FTR_460EX_PHY_CLK_FIX |
+					  EMAC_FTR_HAS_VLAN_CTAG_TX);
 		}
 	} else if (of_device_is_compatible(np, "ibm,emac4")) {
 		dev->features |= EMAC_FTR_EMAC4;
@@ -3148,6 +3166,12 @@ static int emac_probe(struct platform_device *ofdev)
 
 	if (dev->tah_dev) {
 		ndev->hw_features = NETIF_F_IP_CSUM | NETIF_F_SG;
+
+		if (emac_has_feature(dev, EMAC_FTR_HAS_VLAN_CTAG_TX)) {
+			ndev->vlan_features |= ndev->hw_features;
+			ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX;
+		}
+
 		ndev->features |= ndev->hw_features | NETIF_F_RXCSUM;
 	}
 	ndev->watchdog_timeo = 5 * HZ;
diff --git a/drivers/net/ethernet/ibm/emac/core.h b/drivers/net/ethernet/ibm/emac/core.h
index 84caa4a3fc52..8d84d439168c 100644
--- a/drivers/net/ethernet/ibm/emac/core.h
+++ b/drivers/net/ethernet/ibm/emac/core.h
@@ -334,6 +334,8 @@ struct emac_instance {
  * APM821xx does not support Half Duplex mode
  */
 #define EMAC_FTR_APM821XX_NO_HALF_DUPLEX	0x00001000
+/* EMAC can insert 802.1Q tag */
+#define EMAC_FTR_HAS_VLAN_CTAG_TX		0x00002000
 
 /* Right now, we don't quite handle the always/possible masks on the
  * most optimal way as we don't have a way to say something like
@@ -363,7 +365,9 @@ enum {
 	EMAC_FTR_460EX_PHY_CLK_FIX |
 	EMAC_FTR_440EP_PHY_CLK_FIX |
 	EMAC_APM821XX_REQ_JUMBO_FRAME_SIZE |
-	EMAC_FTR_APM821XX_NO_HALF_DUPLEX,
+	EMAC_FTR_APM821XX_NO_HALF_DUPLEX |
+	EMAC_FTR_HAS_VLAN_CTAG_TX |
+	0,
 };
 
 static inline int emac_has_feature(struct emac_instance *dev,
-- 
2.19.1

^ permalink raw reply related

* [PATCH v4 3/3] net: emac: remove IBM_EMAC_RX_SKB_HEADROOM
From: Christian Lamparter @ 2018-11-06 22:27 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Ivan Mikhaylov, Florian Fainelli
In-Reply-To: <f5cdfb446be8091490aace18bff4a9f5e5da8242.1541543231.git.chunkeey@gmail.com>

The EMAC driver had a custom IBM_EMAC_RX_SKB_HEADROOM
Kconfig option that reserved additional skb headroom for RX.
This patch removes the option and migrates the code
to use napi_alloc_skb() and netdev_alloc_skb_ip_align()
in its place.

Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
---
 drivers/net/ethernet/ibm/emac/Kconfig | 12 ------
 drivers/net/ethernet/ibm/emac/core.c  | 57 +++++++++++++++++++--------
 drivers/net/ethernet/ibm/emac/core.h  | 10 ++---
 3 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/Kconfig b/drivers/net/ethernet/ibm/emac/Kconfig
index 90d49191beb3..eacf7e141fdc 100644
--- a/drivers/net/ethernet/ibm/emac/Kconfig
+++ b/drivers/net/ethernet/ibm/emac/Kconfig
@@ -28,18 +28,6 @@ config IBM_EMAC_RX_COPY_THRESHOLD
 	depends on IBM_EMAC
 	default "256"
 
-config IBM_EMAC_RX_SKB_HEADROOM
-	int "Additional RX skb headroom (bytes)"
-	depends on IBM_EMAC
-	default "0"
-	help
-	  Additional receive skb headroom. Note, that driver
-	  will always reserve at least 2 bytes to make IP header
-	  aligned, so usually there is no need to add any additional
-	  headroom.
-
-	  If unsure, set to 0.
-
 config IBM_EMAC_DEBUG
 	bool "Debugging"
 	depends on IBM_EMAC
diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 80aafd7552aa..266b6614125b 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -1075,7 +1075,9 @@ static int emac_resize_rx_ring(struct emac_instance *dev, int new_mtu)
 
 	/* Second pass, allocate new skbs */
 	for (i = 0; i < NUM_RX_BUFF; ++i) {
-		struct sk_buff *skb = alloc_skb(rx_skb_size, GFP_ATOMIC);
+		struct sk_buff *skb;
+
+		skb = netdev_alloc_skb_ip_align(dev->ndev, rx_skb_size);
 		if (!skb) {
 			ret = -ENOMEM;
 			goto oom;
@@ -1084,7 +1086,6 @@ static int emac_resize_rx_ring(struct emac_instance *dev, int new_mtu)
 		BUG_ON(!dev->rx_skb[i]);
 		dev_kfree_skb(dev->rx_skb[i]);
 
-		skb_reserve(skb, EMAC_RX_SKB_HEADROOM + 2);
 		dev->rx_desc[i].data_ptr =
 		    dma_map_single(&dev->ofdev->dev, skb->data - 2, rx_sync_size,
 				   DMA_FROM_DEVICE) + 2;
@@ -1205,20 +1206,18 @@ static void emac_clean_rx_ring(struct emac_instance *dev)
 	}
 }
 
-static inline int emac_alloc_rx_skb(struct emac_instance *dev, int slot,
-				    gfp_t flags)
+static inline int
+__emac_prepare_rx_skb(struct sk_buff *skb, struct emac_instance *dev, int slot)
 {
-	struct sk_buff *skb = alloc_skb(dev->rx_skb_size, flags);
 	if (unlikely(!skb))
 		return -ENOMEM;
 
 	dev->rx_skb[slot] = skb;
 	dev->rx_desc[slot].data_len = 0;
 
-	skb_reserve(skb, EMAC_RX_SKB_HEADROOM + 2);
 	dev->rx_desc[slot].data_ptr =
-	    dma_map_single(&dev->ofdev->dev, skb->data - 2, dev->rx_sync_size,
-			   DMA_FROM_DEVICE) + 2;
+	    dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
+			   dev->rx_sync_size, DMA_FROM_DEVICE) + NET_IP_ALIGN;
 	wmb();
 	dev->rx_desc[slot].ctrl = MAL_RX_CTRL_EMPTY |
 	    (slot == (NUM_RX_BUFF - 1) ? MAL_RX_CTRL_WRAP : 0);
@@ -1226,6 +1225,27 @@ static inline int emac_alloc_rx_skb(struct emac_instance *dev, int slot,
 	return 0;
 }
 
+static inline int
+emac_alloc_rx_skb(struct emac_instance *dev, int slot)
+{
+	struct sk_buff *skb;
+
+	skb = __netdev_alloc_skb_ip_align(dev->ndev, dev->rx_skb_size,
+					  GFP_KERNEL);
+
+	return __emac_prepare_rx_skb(skb, dev, slot);
+}
+
+static inline int
+emac_alloc_rx_skb_napi(struct emac_instance *dev, int slot)
+{
+	struct sk_buff *skb;
+
+	skb = napi_alloc_skb(&dev->mal->napi, dev->rx_skb_size);
+
+	return __emac_prepare_rx_skb(skb, dev, slot);
+}
+
 static void emac_print_link_status(struct emac_instance *dev)
 {
 	if (netif_carrier_ok(dev->ndev))
@@ -1256,7 +1276,7 @@ static int emac_open(struct net_device *ndev)
 
 	/* Allocate RX ring */
 	for (i = 0; i < NUM_RX_BUFF; ++i)
-		if (emac_alloc_rx_skb(dev, i, GFP_KERNEL)) {
+		if (emac_alloc_rx_skb(dev, i)) {
 			printk(KERN_ERR "%s: failed to allocate RX ring\n",
 			       ndev->name);
 			goto oom;
@@ -1779,8 +1799,9 @@ static inline void emac_recycle_rx_skb(struct emac_instance *dev, int slot,
 	DBG2(dev, "recycle %d %d" NL, slot, len);
 
 	if (len)
-		dma_map_single(&dev->ofdev->dev, skb->data - 2,
-			       EMAC_DMA_ALIGN(len + 2), DMA_FROM_DEVICE);
+		dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
+			       SKB_DATA_ALIGN(len + NET_IP_ALIGN),
+			       DMA_FROM_DEVICE);
 
 	dev->rx_desc[slot].data_len = 0;
 	wmb();
@@ -1888,16 +1909,18 @@ static int emac_poll_rx(void *param, int budget)
 		}
 
 		if (len && len < EMAC_RX_COPY_THRESH) {
-			struct sk_buff *copy_skb =
-			    alloc_skb(len + EMAC_RX_SKB_HEADROOM + 2, GFP_ATOMIC);
+			struct sk_buff *copy_skb;
+
+			copy_skb = napi_alloc_skb(&dev->mal->napi, len);
 			if (unlikely(!copy_skb))
 				goto oom;
 
-			skb_reserve(copy_skb, EMAC_RX_SKB_HEADROOM + 2);
-			memcpy(copy_skb->data - 2, skb->data - 2, len + 2);
+			memcpy(copy_skb->data - NET_IP_ALIGN,
+			       skb->data - NET_IP_ALIGN,
+			       len + NET_IP_ALIGN);
 			emac_recycle_rx_skb(dev, slot, len);
 			skb = copy_skb;
-		} else if (unlikely(emac_alloc_rx_skb(dev, slot, GFP_ATOMIC)))
+		} else if (unlikely(emac_alloc_rx_skb_napi(dev, slot)))
 			goto oom;
 
 		skb_put(skb, len);
@@ -1918,7 +1941,7 @@ static int emac_poll_rx(void *param, int budget)
 	sg:
 		if (ctrl & MAL_RX_CTRL_FIRST) {
 			BUG_ON(dev->rx_sg_skb);
-			if (unlikely(emac_alloc_rx_skb(dev, slot, GFP_ATOMIC))) {
+			if (unlikely(emac_alloc_rx_skb_napi(dev, slot))) {
 				DBG(dev, "rx OOM %d" NL, slot);
 				++dev->estats.rx_dropped_oom;
 				emac_recycle_rx_skb(dev, slot, 0);
diff --git a/drivers/net/ethernet/ibm/emac/core.h b/drivers/net/ethernet/ibm/emac/core.h
index 0bcfe952a3cf..0faeb7c7e958 100644
--- a/drivers/net/ethernet/ibm/emac/core.h
+++ b/drivers/net/ethernet/ibm/emac/core.h
@@ -68,22 +68,18 @@ static inline int emac_rx_size(int mtu)
 		return mal_rx_size(ETH_DATA_LEN + EMAC_MTU_OVERHEAD);
 }
 
-#define EMAC_DMA_ALIGN(x)		ALIGN((x), dma_get_cache_alignment())
-
-#define EMAC_RX_SKB_HEADROOM		\
-	EMAC_DMA_ALIGN(CONFIG_IBM_EMAC_RX_SKB_HEADROOM)
-
 /* Size of RX skb for the given MTU */
 static inline int emac_rx_skb_size(int mtu)
 {
 	int size = max(mtu + EMAC_MTU_OVERHEAD, emac_rx_size(mtu));
-	return EMAC_DMA_ALIGN(size + 2) + EMAC_RX_SKB_HEADROOM;
+
+	return SKB_DATA_ALIGN(size + NET_SKB_PAD + NET_IP_ALIGN);
 }
 
 /* RX DMA sync size */
 static inline int emac_rx_sync_size(int mtu)
 {
-	return EMAC_DMA_ALIGN(emac_rx_size(mtu) + 2);
+	return SKB_DATA_ALIGN(emac_rx_size(mtu) + NET_IP_ALIGN);
 }
 
 /* Driver statistcs is split into two parts to make it more cache friendly:
-- 
2.19.1

^ permalink raw reply related

* [PATCH v4 2/3] net: emac: implement TCP segmentation offload (TSO)
From: Christian Lamparter @ 2018-11-06 22:27 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Ivan Mikhaylov, Florian Fainelli
In-Reply-To: <0f8f149bb526b911813cfe555f99ef00db2f1387.1541543231.git.chunkeey@gmail.com>

This patch enables TSO(v4) hw feature for emac driver.
As atleast the APM82181's TCP/IP acceleration hardware
controller (TAH) provides TCP segmentation support in
the transmit path.

Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c | 112 ++++++++++++++++++++++++++-
 drivers/net/ethernet/ibm/emac/core.h |   7 ++
 drivers/net/ethernet/ibm/emac/emac.h |   7 ++
 drivers/net/ethernet/ibm/emac/tah.c  |  22 +++++-
 drivers/net/ethernet/ibm/emac/tah.h  |   2 +
 5 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index be560f9031f4..80aafd7552aa 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -38,6 +38,9 @@
 #include <linux/mii.h>
 #include <linux/bitops.h>
 #include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
 #include <linux/workqueue.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
@@ -1118,6 +1121,32 @@ static int emac_resize_rx_ring(struct emac_instance *dev, int new_mtu)
 	return ret;
 }
 
+/* Restriction applied for the segmentation size
+ * to use HW segmentation offload feature. the size
+ * of the segment must not be less than 168 bytes for
+ * DIX formatted segments, or 176 bytes for
+ * IEEE formatted segments. However based on actual
+ * tests any MTU less than 416 causes excessive retries
+ * due to TX FIFO underruns.
+ */
+const u32 tah_ss[TAH_NO_SSR] = { 1500, 1344, 1152, 960, 768, 416 };
+
+/* look-up matching segment size for the given mtu */
+static void emac_find_tso_ss_for_mtu(struct emac_instance *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tah_ss); i++) {
+		if (tah_ss[i] <= dev->ndev->mtu)
+			break;
+	}
+	/* if no matching segment size is found, set the tso_ss_mtu_start
+	 * variable anyway. This will cause the emac_tx_tso to skip straight
+	 * to the software fallback.
+	 */
+	dev->tso_ss_mtu_start = i;
+}
+
 /* Process ctx, rtnl_lock semaphore */
 static int emac_change_mtu(struct net_device *ndev, int new_mtu)
 {
@@ -1134,6 +1163,7 @@ static int emac_change_mtu(struct net_device *ndev, int new_mtu)
 
 	if (!ret) {
 		ndev->mtu = new_mtu;
+		emac_find_tso_ss_for_mtu(dev);
 		dev->rx_skb_size = emac_rx_skb_size(new_mtu);
 		dev->rx_sync_size = emac_rx_sync_size(new_mtu);
 	}
@@ -1410,6 +1440,33 @@ static inline u16 emac_tx_csum(struct emac_instance *dev,
 	return 0;
 }
 
+static int emac_tx_tso(struct emac_instance *dev, struct sk_buff *skb,
+		       u16 *ctrl)
+{
+	if (emac_has_feature(dev, EMAC_FTR_TAH_HAS_TSO) && skb_is_gso(skb) &&
+	    !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+		u32 seg_size = 0, i;
+
+		/* Get the MTU */
+		seg_size = skb_shinfo(skb)->gso_size + tcp_hdrlen(skb) +
+			   skb_network_header_len(skb);
+
+		for (i = dev->tso_ss_mtu_start; i < ARRAY_SIZE(tah_ss); i++) {
+			if (tah_ss[i] > seg_size)
+				continue;
+
+			*ctrl |= EMAC_TX_CTRL_TAH_SSR(i);
+			return 0;
+		}
+
+		/* none found fall back to software */
+		return -EINVAL;
+	}
+
+	*ctrl |= emac_tx_csum(dev, skb);
+	return 0;
+}
+
 static inline netdev_tx_t emac_xmit_finish(struct emac_instance *dev, int len)
 {
 	struct emac_regs __iomem *p = dev->emacp;
@@ -1452,6 +1509,46 @@ static inline u16 emac_tx_vlan(struct emac_instance *dev, struct sk_buff *skb)
 	return 0;
 }
 
+static netdev_tx_t
+emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev);
+
+static int
+emac_sw_tso(struct sk_buff *skb, struct net_device *ndev)
+{
+	struct emac_instance *dev = netdev_priv(ndev);
+	struct sk_buff *segs, *curr;
+	unsigned int i, frag_slots;
+
+	/* make sure to not overflow the tx ring */
+	frag_slots = dev->tx_cnt;
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
+
+		frag_slots += mal_tx_chunks(skb_frag_size(frag));
+
+		if (frag_slots >= NUM_TX_BUFF)
+			return -ENOSPC;
+	};
+
+	segs = skb_gso_segment(skb, ndev->features &
+					~(NETIF_F_TSO | NETIF_F_TSO6));
+	if (IS_ERR_OR_NULL(segs)) {
+		++dev->estats.tx_dropped;
+		dev_kfree_skb_any(skb);
+	} else {
+		while (segs) {
+			curr = segs;
+			segs = curr->next;
+			curr->next = NULL;
+
+			emac_start_xmit_sg(curr, ndev);
+		}
+		dev_consume_skb_any(skb);
+	}
+
+	return 0;
+}
+
 /* Tx lock BH */
 static netdev_tx_t emac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
@@ -1535,7 +1632,12 @@ emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev)
 		goto stop_queue;
 
 	ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY |
-	    emac_tx_csum(dev, skb) | emac_tx_vlan(dev, skb);
+	    emac_tx_vlan(dev, skb);
+	if (emac_tx_tso(dev, skb, &ctrl)) {
+		if (emac_sw_tso(skb, ndev))
+			goto stop_queue;
+	}
+
 	slot = dev->tx_slot;
 
 	/* skb data */
@@ -2946,6 +3048,9 @@ static int emac_init_config(struct emac_instance *dev)
 	if (dev->tah_ph != 0) {
 #ifdef CONFIG_IBM_EMAC_TAH
 		dev->features |= EMAC_FTR_HAS_TAH;
+
+		if (of_device_is_compatible(np, "ibm,emac-apm821xx"))
+			dev->features |= EMAC_FTR_TAH_HAS_TSO;
 #else
 		printk(KERN_ERR "%pOF: TAH support not enabled !\n", np);
 		return -ENXIO;
@@ -3113,6 +3218,8 @@ static int emac_probe(struct platform_device *ofdev)
 	}
 	dev->rx_skb_size = emac_rx_skb_size(ndev->mtu);
 	dev->rx_sync_size = emac_rx_sync_size(ndev->mtu);
+	ndev->gso_max_segs = NUM_TX_BUFF / 2;
+	emac_find_tso_ss_for_mtu(dev);
 
 	/* Get pointers to BD rings */
 	dev->tx_desc =
@@ -3167,6 +3274,9 @@ static int emac_probe(struct platform_device *ofdev)
 	if (dev->tah_dev) {
 		ndev->hw_features = NETIF_F_IP_CSUM | NETIF_F_SG;
 
+		if (emac_has_feature(dev, EMAC_FTR_TAH_HAS_TSO))
+			ndev->hw_features |= NETIF_F_TSO;
+
 		if (emac_has_feature(dev, EMAC_FTR_HAS_VLAN_CTAG_TX)) {
 			ndev->vlan_features |= ndev->hw_features;
 			ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX;
diff --git a/drivers/net/ethernet/ibm/emac/core.h b/drivers/net/ethernet/ibm/emac/core.h
index 8d84d439168c..0bcfe952a3cf 100644
--- a/drivers/net/ethernet/ibm/emac/core.h
+++ b/drivers/net/ethernet/ibm/emac/core.h
@@ -245,6 +245,9 @@ struct emac_instance {
 	u32				xaht_slots_shift;
 	u32				xaht_width_shift;
 
+	/* TAH TSO start index */
+	int				tso_ss_mtu_start;
+
 	/* Descriptor management
 	 */
 	struct mal_descriptor		*tx_desc;
@@ -336,6 +339,8 @@ struct emac_instance {
 #define EMAC_FTR_APM821XX_NO_HALF_DUPLEX	0x00001000
 /* EMAC can insert 802.1Q tag */
 #define EMAC_FTR_HAS_VLAN_CTAG_TX		0x00002000
+/* TAH can do TCP segmentation offload */
+#define EMAC_FTR_TAH_HAS_TSO			0x00004000
 
 /* Right now, we don't quite handle the always/possible masks on the
  * most optimal way as we don't have a way to say something like
@@ -352,6 +357,8 @@ enum {
 #endif
 #ifdef CONFIG_IBM_EMAC_TAH
 	    EMAC_FTR_HAS_TAH	|
+	    EMAC_FTR_TAH_HAS_TSO	|
+
 #endif
 #ifdef CONFIG_IBM_EMAC_ZMII
 	    EMAC_FTR_HAS_ZMII	|
diff --git a/drivers/net/ethernet/ibm/emac/emac.h b/drivers/net/ethernet/ibm/emac/emac.h
index e2f80cca9bed..833967aceb2f 100644
--- a/drivers/net/ethernet/ibm/emac/emac.h
+++ b/drivers/net/ethernet/ibm/emac/emac.h
@@ -266,6 +266,13 @@ struct emac_regs {
 #define EMAC_TX_CTRL_IVT		0x0020
 #define EMAC_TX_CTRL_RVT		0x0010
 #define EMAC_TX_CTRL_TAH_CSUM		0x000e
+#define EMAC_TX_CTRL_TAH_SSR(idx)	(((idx) + 1) << 1)
+#define EMAC_TX_CTRL_TAH_SSR5		0x000c
+#define EMAC_TX_CTRL_TAH_SSR4		0x000a
+#define EMAC_TX_CTRL_TAH_SSR3		0x0008
+#define EMAC_TX_CTRL_TAH_SSR2		0x0006
+#define EMAC_TX_CTRL_TAH_SSR1		0x0004
+#define EMAC_TX_CTRL_TAH_SSR0		0x0002
 
 /* EMAC specific TX descriptor status fields (read access) */
 #define EMAC_TX_ST_BFCS			0x0200
diff --git a/drivers/net/ethernet/ibm/emac/tah.c b/drivers/net/ethernet/ibm/emac/tah.c
index 9912456dca48..619c08ee22f7 100644
--- a/drivers/net/ethernet/ibm/emac/tah.c
+++ b/drivers/net/ethernet/ibm/emac/tah.c
@@ -45,6 +45,24 @@ void tah_detach(struct platform_device *ofdev, int channel)
 	mutex_unlock(&dev->lock);
 }
 
+static void tah_set_ssr(struct platform_device *ofdev)
+{
+	struct tah_instance *dev = dev_get_drvdata(&ofdev->dev);
+	struct tah_regs __iomem *p = dev->base;
+	int i;
+
+	mutex_lock(&dev->lock);
+
+	for (i = 0; i < ARRAY_SIZE(tah_ss); i++) {
+		/* Segment size can be up to 16K, but needs
+		 * to be a multiple of 2 bytes
+		 */
+		out_be32(&p->ssr0 + i, (tah_ss[i] & 0x3ffc) << 16);
+	}
+
+	mutex_unlock(&dev->lock);
+}
+
 void tah_reset(struct platform_device *ofdev)
 {
 	struct tah_instance *dev = platform_get_drvdata(ofdev);
@@ -64,6 +82,8 @@ void tah_reset(struct platform_device *ofdev)
 	out_be32(&p->mr,
 		 TAH_MR_CVR | TAH_MR_ST_768 | TAH_MR_TFS_10KB | TAH_MR_DTFP |
 		 TAH_MR_DIG);
+
+	tah_set_ssr(ofdev);
 }
 
 int tah_get_regs_len(struct platform_device *ofdev)
@@ -118,7 +138,7 @@ static int tah_probe(struct platform_device *ofdev)
 
 	platform_set_drvdata(ofdev, dev);
 
-	/* Initialize TAH and enable IPv4 checksum verification, no TSO yet */
+	/* Initialize TAH and enable IPv4 checksum verification */
 	tah_reset(ofdev);
 
 	printk(KERN_INFO "TAH %pOF initialized\n", ofdev->dev.of_node);
diff --git a/drivers/net/ethernet/ibm/emac/tah.h b/drivers/net/ethernet/ibm/emac/tah.h
index 4d5f336f07b3..2cb0629f30e2 100644
--- a/drivers/net/ethernet/ibm/emac/tah.h
+++ b/drivers/net/ethernet/ibm/emac/tah.h
@@ -36,6 +36,8 @@ struct tah_regs {
 	u32 tsr;
 };
 
+#define TAH_NO_SSR	6
+extern const u32 tah_ss[TAH_NO_SSR];
 
 /* TAH device */
 struct tah_instance {
-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH net-next] ipv6: gro: do not use slow memcmp() in ipv6_gro_receive()
From: David Miller @ 2018-11-06 22:41 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, eric.dumazet
In-Reply-To: <20181106222552.5423-1-edumazet@google.com>

From: Eric Dumazet <edumazet@google.com>
Date: Tue,  6 Nov 2018 14:25:52 -0800

> +		if (unlikely(nlen > sizeof(struct ipv6hdr))) {
> +			if (memcmp(iph + 1, iph2 + 1,
> +				   nlen - sizeof(struct ipv6hdr)))
> +				goto not_same_flow;
> +		}

Is this even possible?

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*iph);
	iph = skb_gro_header_fast(skb, off);

off is some offset to the ipv6hdr in skb.  This is GRO's CB data_offset.

	skb_set_network_header(skb, off);
	skb_gro_pull(skb, sizeof(*iph));
	skb_set_transport_header(skb, skb_gro_offset(skb));

Set network header to location of iph in SKB.

GRO pull causes an incremebt of data_offset by sizeof(*iph) bytes.

Set transport header to new data_offset value.

	nlen = skb_network_header_len(skb);

This is transport_header - network_header.

>From what I can see, it is impossible for this to take on any value
other than sizeof(*ipv6hdr).

If you agree, please let's get rid of nlen and this useless code, and
replace with sizeof(*ipv6hdr) as needed.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next 1/3] devlink: Add fw_version_check generic parameter
From: Jakub Kicinski @ 2018-11-06 22:47 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Ido Schimmel, netdev@vger.kernel.org, davem@davemloft.net,
	Jiri Pirko, Shalom Toledo, Moshe Shemesh, dsahern@gmail.com,
	andrew@lunn.ch, f.fainelli@gmail.com, mlxsw
In-Reply-To: <20181106203751.GA30851@splinter.mtl.com>

On Tue, 6 Nov 2018 22:37:51 +0200, Ido Schimmel wrote:
> On Tue, Nov 06, 2018 at 12:19:13PM -0800, Jakub Kicinski wrote:
> > On Tue, 6 Nov 2018 20:05:00 +0000, Ido Schimmel wrote:  
> > > From: Shalom Toledo <shalomt@mellanox.com>
> > > 
> > > Many drivers checking the device's firmware version during the
> > > initialization flow and flashing a compatible version if the current
> > > version is not.
> > > 
> > > fw_version_check gives the ability to skip this check which allows to run
> > > the device with a different firmware version than required by the driver
> > > for testing and/or debugging purposes.
> > > 
> > > Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
> > > Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> > > Signed-off-by: Ido Schimmel <idosch@mellanox.com>  
> > 
> > The documentation is missing, so it's hard to comment on the definition
> > of the parameter...    
> 
> I assume you mean Documentation/networking/devlink-params.txt ?

Yes

> > We have a FW loading policy for NFP, too, so it'd be good to see if we
> > can find a common ground.  
> 
> If the parameter is set, then device runs with whatever firmware version
> was last flashed (via ethtool, for example). Otherwise, the driver will
> flash a version according to its policy. In mlxsw, it is a specific
> version.
> 
> Will that work for you?

Our FW is always backward compatible so there is no need to downgrade.

What we have is this more along these lines: there are two images one
on disk and second in the flash.  The FW loading policy can decide
which of those should be preferred, or should the versions be compared
and the newer one win (default).  But we don't flash the newer FW, just
potentially load it from disk today.

I'm not sure whether 'fw_version_check' describes the general behaviour
of not updating the FW in flash.  The policy of updating the FW in the
flash if the one on disk is newer seems to be something we could adopt
as well.  Can we come up with a more general parameter which could
select FW loading policy that'd for both cases?

Would values like these make any sense to you?
 - driver preferred (your default behaviour, we don't support since
   driver doesn't care);
 - newest (our default, device compares images and picks newer);
 - always disk (always run with what's on the disk, regardless of
   versions);
 - always flash (always run with what's already in flash, don't look at
   disk);

Separate bool parameter 'fw_flash_auto_update' would decide whether the
selected FW should be flashed to the device (always true for you AFAIU).

Let me know if that makes sense, it would be nice if we could converge
on a common solution, or at least name our parameters sufficiently
distinctly to avoid confusion :)

^ permalink raw reply

* Re: [PATCH net-next] ipv6: gro: do not use slow memcmp() in ipv6_gro_receive()
From: Eric Dumazet @ 2018-11-06 22:51 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Eric Dumazet
In-Reply-To: <20181106.144114.1805194057718658471.davem@davemloft.net>

On Tue, Nov 6, 2018 at 2:41 PM David Miller <davem@davemloft.net> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
> Date: Tue,  6 Nov 2018 14:25:52 -0800
>
> > +             if (unlikely(nlen > sizeof(struct ipv6hdr))) {
> > +                     if (memcmp(iph + 1, iph2 + 1,
> > +                                nlen - sizeof(struct ipv6hdr)))
> > +                             goto not_same_flow;
> > +             }
>
> Is this even possible?

I believe that nlen can be indeed > sizeof(struct ipv6hdr) in presence
of exthdrs,
eg if ipv6_gso_pull_exthdrs() had to be called (line 201)

 I admit I have not checked if this was actually possible.


>
>         off = skb_gro_offset(skb);
>         hlen = off + sizeof(*iph);
>         iph = skb_gro_header_fast(skb, off);
>
> off is some offset to the ipv6hdr in skb.  This is GRO's CB data_offset.
>
>         skb_set_network_header(skb, off);
>         skb_gro_pull(skb, sizeof(*iph));
>         skb_set_transport_header(skb, skb_gro_offset(skb));
>
> Set network header to location of iph in SKB.
>
> GRO pull causes an incremebt of data_offset by sizeof(*iph) bytes.
>
> Set transport header to new data_offset value.
>
>         nlen = skb_network_header_len(skb);
>
> This is transport_header - network_header.
>
> From what I can see, it is impossible for this to take on any value
> other than sizeof(*ipv6hdr).
>
> If you agree, please let's get rid of nlen and this useless code, and
> replace with sizeof(*ipv6hdr) as needed.
>
> Thanks.

^ permalink raw reply

* Re: [PATCH bpf-next v2 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
From: Edward Cree @ 2018-11-06 22:58 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Martin Lau, Yonghong Song, Alexei Starovoitov,
	daniel@iogearbox.net, netdev@vger.kernel.org, Kernel Team
In-Reply-To: <20181106215633.gi6v4fi2llmqo2rn@ast-mbp>

On 06/11/18 21:56, Alexei Starovoitov wrote:
> that looks very weird to me. Why split func name from argument names?
> The 'function name' as seen by the BTF may not be the symbol name
> as seen in elf file.
The symbol name will be in the symbol table, which is not the same
 thing as the functions table in BTF that I'm proposing.  (They do
 look a little similar as I included an insn_idx for functions that
 partially duplicates the offset given in the symbol table.  But
 that's necessary precisely for the reason you mention, that the
 function name != the symbol name in general.)
"Splitting" func name from argument names is partly to potentially
 save space — if we'd had "int bar(int x)" instead then 'bar' could
 share its type record with 'foo'.  And partly just because the
 name of the function itself is no more part of its type than the
 name of an integer variable is part of the integer's type.
(Whereas names of parameters are like names of struct members:
 while they are not part of the 'pure type' from a language
 perspective, they are part of the type from the perspective of
 debugging, which is why they belong in the BTF type record.)

> There are C, bpftrace, p4 and python frontends. These languages
> should be free to put into BTF KIND_FUNC name that makes sense
> from the language point of view.
I'm paying attention to BTF because I'm adding support for it into
 my ebpf_asm.  Don't you think I *know* that frontends for BPF are
 more than just C?

>>  and in the 'variables' section we might have
>> 1 "quux" type=1 where=stack func=1 offset=-8
> that doesn't work. stack slots can be reused by compiler.
And who says that there can't be multiple records pointing to the
 same stack slot with different types & names?

> Instead we will annotate every load/store with btf type id.
That's certainly more useful; but I think most useful of all is to
 have *both* (though the stack slot types should be optional).

> The global variables for given .c file will look like single KIND_STRUCT
That's exactly the kind of superficially-clever but nasty hack
 that results from the continued insistence on conflating types
 and instances (objects).  In the long run it will make
 maintenance harder, and frustrate new features owing to the need
 to find new hacks to shoehorn them into the same model.
Instead there should be entries for the globals in something like
 the variables table I mentioned,
2 "fred" type=1 where=global func=0 offset=8
 in which 'func' is unused and 'offset' gives offset in .bss.
 'where' might also include indication of whether it's static.

Then for linkage you can extend this with index of which file it
 came from.

But maybe discussing global variables is a bit premature as eBPF
 doesn't have any such thing yet.

> yes we do see these things differently.
> To us function name is the debug info that fits well into BTF description.
> Whereas you see the function name part of function declaration
> as something 'entirely different'.
I'm not saying that the function name is 'entirely different'
 to the rest of the type.  (Though I do think it doesn't
 belong in the type, that's a weaker and contingent point.)
I'm saying that the *function* is entirely different to its
 *type*.  It's a category error to conflate them:
    f: x ↦ x + 1
 is a function.
    int → int
 is a type, and specifically the type of the object named "f".
(And the nature of mathematical notation for functions happens
 to put the name 'x' in the former, whereas we are putting the
 parameter name in the latter, but that's irrelevant.)
Similarly, "1" is an integer, but "integer" is a type, and is
 not itself an integer, while "1" is not a type.  They are at
 different meta-levels.

-Ed

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox