Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 6/9] selftests: pmtu: Add pmtu_vti4_exception test
From: Stefano Brivio @ 2018-03-15 16:18 UTC (permalink / raw)
  To: David S . Miller; +Cc: Sabrina Dubroca, Steffen Klassert, netdev
In-Reply-To: <cover.1521129192.git.sbrivio@redhat.com>

This test checks that PMTU exceptions are created only when
needed on IPv4 routes with vti and xfrm, and their PMTU value is
checked as well.

We can't adopt the same approach as test_pmtu_vti6_exception()
here, because on IPv4 administrative MTU changes won't be
reflected directly on PMTU.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tools/testing/selftests/net/pmtu.sh | 84 ++++++++++++++++++++++++++++++++-----
 1 file changed, 74 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 67b77f9108ee..336b8545c4bd 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -6,6 +6,14 @@
 #
 # Tests currently implemented:
 #
+# - pmtu_vti4_exception
+#	Set up vti tunnel on top of veth, with xfrm states and policies, in two
+#	namespaces with matching endpoints. Check that route exception is not
+#	created if link layer MTU is not exceeded, then exceed it and check that
+#	exception is created with the expected PMTU. The approach described
+#	below for IPv6 doesn't apply here, because, on IPv4, administrative MTU
+#	changes alone won't affect PMTU
+#
 # - pmtu_vti6_exception
 #	Set up vti6 tunnel on top of veth, with xfrm states and policies, in two
 #	namespaces with matching endpoints. Check that route exception is
@@ -22,7 +30,8 @@
 # - pmtu_vti6_default_mtu
 #	Same as above, for IPv6
 
-tests="pmtu_vti6_exception pmtu_vti4_default_mtu pmtu_vti6_default_mtu"
+tests="pmtu_vti4_exception pmtu_vti6_exception
+       pmtu_vti4_default_mtu pmtu_vti6_default_mtu"
 
 NS_A="ns-$(mktemp -u XXXXXX)"
 NS_B="ns-$(mktemp -u XXXXXX)"
@@ -103,19 +112,33 @@ setup_vti6() {
 }
 
 setup_xfrm() {
-	${ns_a} ip -6 xfrm state add src ${veth6_a_addr} dst ${veth6_b_addr} spi 0x1000 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel || return 0
-	${ns_a} ip -6 xfrm state add src ${veth6_b_addr} dst ${veth6_a_addr} spi 0x1001 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
-	${ns_a} ip -6 xfrm policy add dir out mark 10 tmpl src ${veth6_a_addr} dst ${veth6_b_addr} proto esp mode tunnel
-	${ns_a} ip -6 xfrm policy add dir in mark 10 tmpl src ${veth6_b_addr} dst ${veth6_a_addr} proto esp mode tunnel
+	proto=${1}
+	veth_a_addr="${2}"
+	veth_b_addr="${3}"
 
-	${ns_b} ip -6 xfrm state add src ${veth6_a_addr} dst ${veth6_b_addr} spi 0x1000 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
-	${ns_b} ip -6 xfrm state add src ${veth6_b_addr} dst ${veth6_a_addr} spi 0x1001 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
-	${ns_b} ip -6 xfrm policy add dir out mark 10 tmpl src ${veth6_b_addr} dst ${veth6_a_addr} proto esp mode tunnel
-	${ns_b} ip -6 xfrm policy add dir in mark 10 tmpl src ${veth6_a_addr} dst ${veth6_b_addr} proto esp mode tunnel
+	${ns_a} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel || return 0
+	${ns_a} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
+	${ns_a} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel
+	${ns_a} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel
+
+	${ns_b} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
+	${ns_b} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
+	${ns_b} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel
+	${ns_b} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel
 
 	return 1
 }
 
+setup_xfrm4() {
+	setup_xfrm 4 ${veth4_a_addr} ${veth4_b_addr}
+	return $?
+}
+
+setup_xfrm6() {
+	setup_xfrm 6 ${veth6_a_addr} ${veth6_b_addr}
+	return $?
+}
+
 setup() {
 	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return 0
 
@@ -180,8 +203,49 @@ route_get_dst_pmtu_from_exception() {
 	mtu_parse "$(route_get_dst_exception "${ns_cmd}" ${dst})"
 }
 
+test_pmtu_vti4_exception() {
+	setup namespaces veth vti4 xfrm4 && return 2
+
+	veth_mtu=1500
+	vti_mtu=$((veth_mtu - 20))
+
+	#                                SPI   SN   IV  ICV   pad length   next header
+	esp_payload_rfc4106=$((vti_mtu - 4   - 4  - 8 - 16  - 1          - 1))
+	ping_payload=$((esp_payload_rfc4106 - 28))
+
+	mtu "${ns_a}" veth_a ${veth_mtu}
+	mtu "${ns_b}" veth_b ${veth_mtu}
+	mtu "${ns_a}" vti4_a ${vti_mtu}
+	mtu "${ns_b}" vti4_b ${vti_mtu}
+
+	# Send DF packet without exceeding link layer MTU, check that no
+	# exception is created
+	${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${vti4_b_addr} > /dev/null
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})"
+	if [ "${pmtu}" != "" ]; then
+		echo "  unexpected exception created with PMTU ${pmtu} for IP payload length ${esp_payload_rfc4106}"
+		return 0
+	fi
+
+	# Now exceed link layer MTU by one byte, check that exception is created
+	${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${vti4_b_addr} > /dev/null
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})"
+	if [ "${pmtu}" = "" ]; then
+		echo "  exception not created for IP payload length $((esp_payload_rfc4106 + 1))"
+		return 0
+	fi
+
+	# ...with the right PMTU value
+	if [ ${pmtu} -ne ${esp_payload_rfc4106} ]; then
+		echo "  wrong PMTU ${pmtu} in exception, expected: ${esp_payload_rfc4106}"
+		return 0
+	fi
+
+	return 1
+}
+
 test_pmtu_vti6_exception() {
-	setup namespaces veth vti6 xfrm && return 2
+	setup namespaces veth vti6 xfrm6 && return 2
 
 	# Create route exception by exceeding link layer MTU
 	mtu "${ns_a}" veth_a 4000
-- 
2.15.1

^ permalink raw reply related

* [PATCH net-next 5/9] selftests: pmtu: Add pmtu_vti6_default_mtu test
From: Stefano Brivio @ 2018-03-15 16:18 UTC (permalink / raw)
  To: David S . Miller; +Cc: Sabrina Dubroca, Steffen Klassert, netdev
In-Reply-To: <cover.1521129192.git.sbrivio@redhat.com>

Same as pmtu_vti4_default_mtu, but on IPv6 with vti6.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tools/testing/selftests/net/pmtu.sh | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 440d2defe592..67b77f9108ee 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -18,8 +18,11 @@
 #	endpoints. Check that MTU assigned to vti interface is the MTU of the
 #	lower layer (veth) minus additional lower layer headers (zero, for veth)
 #	minus IPv4 header length
+#
+# - pmtu_vti6_default_mtu
+#	Same as above, for IPv6
 
-tests="pmtu_vti6_exception pmtu_vti4_default_mtu"
+tests="pmtu_vti6_exception pmtu_vti4_default_mtu pmtu_vti6_default_mtu"
 
 NS_A="ns-$(mktemp -u XXXXXX)"
 NS_B="ns-$(mktemp -u XXXXXX)"
@@ -225,6 +228,20 @@ test_pmtu_vti4_default_mtu() {
 	return 1
 }
 
+test_pmtu_vti6_default_mtu() {
+	setup namespaces veth vti6 && return 2
+
+	# Check that MTU of vti device is MTU of veth minus IPv6 header length
+	veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
+	vti6_mtu="$(link_get_mtu "${ns_a}" vti6_a)"
+	if [ $((veth_mtu - vti6_mtu)) -ne 40 ]; then
+		echo "  vti MTU ${vti6_mtu} is not veth MTU ${veth_mtu} minus IPv6 header length"
+		return 0
+	fi
+
+	return 1
+}
+
 trap cleanup EXIT
 
 exitcode=0
-- 
2.15.1

^ permalink raw reply related

* [PATCH net-next 4/9] selftests: pmtu: Add pmtu_vti4_default_mtu test
From: Stefano Brivio @ 2018-03-15 16:18 UTC (permalink / raw)
  To: David S . Miller; +Cc: Sabrina Dubroca, Steffen Klassert, netdev
In-Reply-To: <cover.1521129192.git.sbrivio@redhat.com>

This test checks that the MTU assigned by default to a vti
(IPv4) interface created on top of veth is simply veth's MTU
minus the length of the encapsulated IPv4 header.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tools/testing/selftests/net/pmtu.sh | 89 +++++++++++++++++++++++++++++++------
 1 file changed, 76 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index feebb6fcff21..440d2defe592 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -1,7 +1,8 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 #
-# Check that route PMTU values match expectations
+# Check that route PMTU values match expectations, and that initial device MTU
+# values are assigned correctly
 #
 # Tests currently implemented:
 #
@@ -11,18 +12,30 @@
 #	created by exceeding link layer MTU with ping to other endpoint. Then
 #	decrease and increase MTU of tunnel, checking that route exception PMTU
 #	changes accordingly
+#
+# - pmtu_vti4_default_mtu
+#	Set up vti4 tunnel on top of veth, in two namespaces with matching
+#	endpoints. Check that MTU assigned to vti interface is the MTU of the
+#	lower layer (veth) minus additional lower layer headers (zero, for veth)
+#	minus IPv4 header length
 
-tests="pmtu_vti6_exception"
+tests="pmtu_vti6_exception pmtu_vti4_default_mtu"
 
 NS_A="ns-$(mktemp -u XXXXXX)"
 NS_B="ns-$(mktemp -u XXXXXX)"
 ns_a="ip netns exec ${NS_A}"
 ns_b="ip netns exec ${NS_B}"
 
+veth4_a_addr="192.168.1.1"
+veth4_b_addr="192.168.1.2"
+veth4_mask="24"
 veth6_a_addr="fd00:1::a"
 veth6_b_addr="fd00:1::b"
 veth6_mask="64"
 
+vti4_a_addr="192.168.2.1"
+vti4_b_addr="192.168.2.2"
+vti4_mask="24"
 vti6_a_addr="fd00:2::a"
 vti6_b_addr="fd00:2::b"
 vti6_mask="64"
@@ -40,6 +53,9 @@ setup_veth() {
 	${ns_a} ip link add veth_a type veth peer name veth_b || return 0
 	${ns_a} ip link set veth_b netns ${NS_B}
 
+	${ns_a} ip addr add ${veth4_a_addr}/${veth4_mask} dev veth_a
+	${ns_b} ip addr add ${veth4_b_addr}/${veth4_mask} dev veth_b
+
 	${ns_a} ip addr add ${veth6_a_addr}/${veth6_mask} dev veth_a
 	${ns_b} ip addr add ${veth6_b_addr}/${veth6_mask} dev veth_b
 
@@ -49,21 +65,40 @@ setup_veth() {
 	return 1
 }
 
-setup_vti6() {
-	${ns_a} ip link add vti_a type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10 || return 0
-	${ns_b} ip link add vti_b type vti6 local ${veth6_b_addr} remote ${veth6_a_addr} key 10
+setup_vti() {
+	proto=${1}
+	veth_a_addr="${2}"
+	veth_b_addr="${3}"
+	vti_a_addr="${4}"
+	vti_b_addr="${5}"
+	vti_mask=${6}
+
+	[ ${proto} -eq 6 ] && vti_type="vti6" || vti_type="vti"
 
-	${ns_a} ip addr add ${vti6_a_addr}/${vti6_mask} dev vti_a
-	${ns_b} ip addr add ${vti6_b_addr}/${vti6_mask} dev vti_b
+	${ns_a} ip link add vti${proto}_a type ${vti_type} local ${veth_a_addr} remote ${veth_b_addr} key 10 || return 0
+	${ns_b} ip link add vti${proto}_b type ${vti_type} local ${veth_b_addr} remote ${veth_a_addr} key 10
 
-	${ns_a} ip link set vti_a up
-	${ns_b} ip link set vti_b up
+	${ns_a} ip addr add ${vti_a_addr}/${vti_mask} dev vti${proto}_a
+	${ns_b} ip addr add ${vti_b_addr}/${vti_mask} dev vti${proto}_b
+
+	${ns_a} ip link set vti${proto}_a up
+	${ns_b} ip link set vti${proto}_b up
 
 	sleep 1
 
 	return 1
 }
 
+setup_vti4() {
+	setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} ${vti4_a_addr} ${vti4_b_addr} ${vti4_mask}
+	return $?
+}
+
+setup_vti6() {
+	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${vti6_a_addr} ${vti6_b_addr} ${vti6_mask}
+	return $?
+}
+
 setup_xfrm() {
 	${ns_a} ip -6 xfrm state add src ${veth6_a_addr} dst ${veth6_b_addr} spi 0x1000 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel || return 0
 	${ns_a} ip -6 xfrm state add src ${veth6_b_addr} dst ${veth6_a_addr} spi 0x1001 proto esp aead "rfc4106(gcm(aes))" 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel
@@ -114,6 +149,20 @@ mtu_parse() {
 	done
 }
 
+link_get() {
+	ns_cmd="${1}"
+	name="${2}"
+
+	${ns_cmd} ip link show dev "${name}"
+}
+
+link_get_mtu() {
+	ns_cmd="${1}"
+	name="${2}"
+
+	mtu_parse "$(link_get "${ns_cmd}" ${name})"
+}
+
 route_get_dst_exception() {
 	ns_cmd="${1}"
 	dst="${2}"
@@ -134,8 +183,8 @@ test_pmtu_vti6_exception() {
 	# Create route exception by exceeding link layer MTU
 	mtu "${ns_a}" veth_a 4000
 	mtu "${ns_b}" veth_b 4000
-	mtu "${ns_a}" vti_a 5000
-	mtu "${ns_b}" vti_b 5000
+	mtu "${ns_a}" vti6_a 5000
+	mtu "${ns_b}" vti6_b 5000
 	${ns_a} ping6 -q -i 0.1 -w 2 -s 60000 ${vti6_b_addr} > /dev/null
 
 	# Check that exception was created
@@ -145,7 +194,7 @@ test_pmtu_vti6_exception() {
 	fi
 
 	# Decrease tunnel MTU, check for PMTU decrease in route exception
-	mtu "${ns_a}" vti_a 3000
+	mtu "${ns_a}" vti6_a 3000
 
 	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" -ne 3000 ]; then
 		echo "  decreasing tunnel MTU didn't decrease route exception PMTU"
@@ -153,7 +202,7 @@ test_pmtu_vti6_exception() {
 	fi
 
 	# Increase tunnel MTU, check for PMTU increase in route exception
-	mtu "${ns_a}" vti_a 9000
+	mtu "${ns_a}" vti6_a 9000
 	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" -ne 9000 ]; then
 		echo "  increasing tunnel MTU didn't increase route exception PMTU"
 		return 0
@@ -162,6 +211,20 @@ test_pmtu_vti6_exception() {
 	return 1
 }
 
+test_pmtu_vti4_default_mtu() {
+	setup namespaces veth vti4 && return 2
+
+	# Check that MTU of vti device is MTU of veth minus IPv4 header length
+	veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
+	vti4_mtu="$(link_get_mtu "${ns_a}" vti4_a)"
+	if [ $((veth_mtu - vti4_mtu)) -ne 20 ]; then
+		echo "  vti MTU ${vti4_mtu} is not veth MTU ${veth_mtu} minus IPv4 header length"
+		return 0
+	fi
+
+	return 1
+}
+
 trap cleanup EXIT
 
 exitcode=0
-- 
2.15.1

^ permalink raw reply related

* [PATCH net-next 3/9] selftests: pmtu: Introduce support for multiple tests
From: Stefano Brivio @ 2018-03-15 16:18 UTC (permalink / raw)
  To: David S . Miller; +Cc: Sabrina Dubroca, Steffen Klassert, netdev
In-Reply-To: <cover.1521129192.git.sbrivio@redhat.com>

Introduce list of tests and loop on it in main body.

Tests will now just take care of calling setup with a list of
"units" they need, and return 0 for failure, 1 for success, 2 if
the test had to be skipped.

Main script body will take care of displaying results and
cleaning up after every test. Introduce guard variable so that
we don't clean up twice in case of interrupts or unexpected
failures.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tools/testing/selftests/net/pmtu.sh | 58 ++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index fc79cc6f49a6..feebb6fcff21 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -5,13 +5,15 @@
 #
 # Tests currently implemented:
 #
-# - test_pmtu_vti6_exception
+# - pmtu_vti6_exception
 #	Set up vti6 tunnel on top of veth, with xfrm states and policies, in two
 #	namespaces with matching endpoints. Check that route exception is
 #	created by exceeding link layer MTU with ping to other endpoint. Then
 #	decrease and increase MTU of tunnel, checking that route exception PMTU
 #	changes accordingly
 
+tests="pmtu_vti6_exception"
+
 NS_A="ns-$(mktemp -u XXXXXX)"
 NS_B="ns-$(mktemp -u XXXXXX)"
 ns_a="ip netns exec ${NS_A}"
@@ -25,6 +27,8 @@ vti6_a_addr="fd00:2::a"
 vti6_b_addr="fd00:2::b"
 vti6_mask="64"
 
+cleanup_done=1
+
 setup_namespaces() {
 	ip netns add ${NS_A} || return 0
 	ip netns add ${NS_B}
@@ -75,26 +79,21 @@ setup_xfrm() {
 }
 
 setup() {
-	tunnel_type="$1"
+	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return 0
 
-	[ "$(id -u)" -ne 0 ] && echo "SKIP: need to run as root" && exit 0
-
-	setup_namespaces && echo "SKIP: namespaces not supported" && exit 0
-	setup_veth && echo "SKIP: veth not supported" && exit 0
+	cleanup_done=0
+	for arg do
+		eval setup_${arg} && echo "  ${arg} not supported" && return 0
+	done
 
-	case ${tunnel_type} in
-	"vti6")
-		setup_vti6 && echo "SKIP: vti6 not supported" && exit 0
-		setup_xfrm && echo "SKIP: xfrm not supported" && exit 0
-		;;
-	*)
-		;;
-	esac
+	return 1
 }
 
 cleanup() {
+	[ ${cleanup_done} -eq 1 ] && return
 	ip netns del ${NS_A} 2 > /dev/null
 	ip netns del ${NS_B} 2 > /dev/null
+	cleanup_done=1
 }
 
 mtu() {
@@ -130,7 +129,7 @@ route_get_dst_pmtu_from_exception() {
 }
 
 test_pmtu_vti6_exception() {
-	setup vti6
+	setup namespaces veth vti6 xfrm && return 2
 
 	# Create route exception by exceeding link layer MTU
 	mtu "${ns_a}" veth_a 4000
@@ -141,30 +140,41 @@ test_pmtu_vti6_exception() {
 
 	# Check that exception was created
 	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" = "" ]; then
-		echo "FAIL: Tunnel exceeding link layer MTU didn't create route exception"
-		exit 1
+		echo "  tunnel exceeding link layer MTU didn't create route exception"
+		return 0
 	fi
 
 	# Decrease tunnel MTU, check for PMTU decrease in route exception
 	mtu "${ns_a}" vti_a 3000
 
 	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" -ne 3000 ]; then
-		echo "FAIL: Decreasing tunnel MTU didn't decrease route exception PMTU"
-		exit 1
+		echo "  decreasing tunnel MTU didn't decrease route exception PMTU"
+		return 0
 	fi
 
 	# Increase tunnel MTU, check for PMTU increase in route exception
 	mtu "${ns_a}" vti_a 9000
 	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" -ne 9000 ]; then
-		echo "FAIL: Increasing tunnel MTU didn't increase route exception PMTU"
-		exit 1
+		echo "  increasing tunnel MTU didn't increase route exception PMTU"
+		return 0
 	fi
 
-	echo "PASS"
+	return 1
 }
 
 trap cleanup EXIT
 
-test_pmtu_vti6_exception
+exitcode=0
+for name in ${tests}; do
+	echo "${name}: START"
+	eval test_${name}
+	ret=$?
+	cleanup
+
+	if [ $ret -eq 0 ]; then echo "${name}: FAIL"; exitcode=1
+	elif [ $ret -eq 1 ]; then echo "${name}: PASS"
+	elif [ $ret -eq 2 ]; then echo "${name}: SKIP"
+	fi
+done
 
-exit 0
+exit ${exitcode}
-- 
2.15.1

^ permalink raw reply related

* [PATCH net-next 2/9] selftests: pmtu: Factor out MTU parsing helper
From: Stefano Brivio @ 2018-03-15 16:18 UTC (permalink / raw)
  To: David S . Miller; +Cc: Sabrina Dubroca, Steffen Klassert, netdev
In-Reply-To: <cover.1521129192.git.sbrivio@redhat.com>

...so that it can be used for any iproute command output.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tools/testing/selftests/net/pmtu.sh | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 7a7845e415e4..fc79cc6f49a6 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -105,6 +105,16 @@ mtu() {
 	${ns_cmd} ip link set dev ${dev} mtu ${mtu}
 }
 
+mtu_parse() {
+	input="${1}"
+
+	next=0
+	for i in ${input}; do
+		[ ${next} -eq 1 ] && echo "${i}" && return
+		[ "${i}" = "mtu" ] && next=1
+	done
+}
+
 route_get_dst_exception() {
 	ns_cmd="${1}"
 	dst="${2}"
@@ -116,12 +126,7 @@ route_get_dst_pmtu_from_exception() {
 	ns_cmd="${1}"
 	dst="${2}"
 
-	exception="$(route_get_dst_exception "${ns_cmd}" ${dst})"
-	next=0
-	for i in ${exception}; do
-		[ ${next} -eq 1 ] && echo "${i}" && return
-		[ "${i}" = "mtu" ] && next=1
-	done
+	mtu_parse "$(route_get_dst_exception "${ns_cmd}" ${dst})"
 }
 
 test_pmtu_vti6_exception() {
-- 
2.15.1

^ permalink raw reply related

* [PATCH net-next 1/9] selftests: pmtu: Use namespace command prefix to fetch route mtu
From: Stefano Brivio @ 2018-03-15 16:18 UTC (permalink / raw)
  To: David S . Miller; +Cc: Sabrina Dubroca, Steffen Klassert, netdev
In-Reply-To: <cover.1521129192.git.sbrivio@redhat.com>

In commit 7af137b72131 ("selftests: net: Introduce first PMTU
test") I accidentally assumed route_get_* helpers would run from
a single namespace. Make them a bit more generic, by passing the
namespace command prefix as a parameter instead.

Fixes: 7af137b72131 ("selftests: net: Introduce first PMTU test")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tools/testing/selftests/net/pmtu.sh | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 6c19c148cef8..7a7845e415e4 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -106,15 +106,17 @@ mtu() {
 }
 
 route_get_dst_exception() {
-	dst="${1}"
+	ns_cmd="${1}"
+	dst="${2}"
 
-	${ns_a} ip route get "${dst}"
+	${ns_cmd} ip route get "${dst}"
 }
 
 route_get_dst_pmtu_from_exception() {
-	dst="${1}"
+	ns_cmd="${1}"
+	dst="${2}"
 
-	exception="$(route_get_dst_exception ${dst})"
+	exception="$(route_get_dst_exception "${ns_cmd}" ${dst})"
 	next=0
 	for i in ${exception}; do
 		[ ${next} -eq 1 ] && echo "${i}" && return
@@ -133,7 +135,7 @@ test_pmtu_vti6_exception() {
 	${ns_a} ping6 -q -i 0.1 -w 2 -s 60000 ${vti6_b_addr} > /dev/null
 
 	# Check that exception was created
-	if [ "$(route_get_dst_pmtu_from_exception ${vti6_b_addr})" = "" ]; then
+	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" = "" ]; then
 		echo "FAIL: Tunnel exceeding link layer MTU didn't create route exception"
 		exit 1
 	fi
@@ -141,14 +143,14 @@ test_pmtu_vti6_exception() {
 	# Decrease tunnel MTU, check for PMTU decrease in route exception
 	mtu "${ns_a}" vti_a 3000
 
-	if [ "$(route_get_dst_pmtu_from_exception ${vti6_b_addr})" -ne 3000 ]; then
+	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" -ne 3000 ]; then
 		echo "FAIL: Decreasing tunnel MTU didn't decrease route exception PMTU"
 		exit 1
 	fi
 
 	# Increase tunnel MTU, check for PMTU increase in route exception
 	mtu "${ns_a}" vti_a 9000
-	if [ "$(route_get_dst_pmtu_from_exception ${vti6_b_addr})" -ne 9000 ]; then
+	if [ "$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" -ne 9000 ]; then
 		echo "FAIL: Increasing tunnel MTU didn't increase route exception PMTU"
 		exit 1
 	fi
-- 
2.15.1

^ permalink raw reply related

* [PATCH net-next 0/9] selftests: pmtu: Add further vti/vti6 MTU and PMTU tests
From: Stefano Brivio @ 2018-03-15 16:18 UTC (permalink / raw)
  To: David S . Miller; +Cc: Sabrina Dubroca, Steffen Klassert, netdev

Patches 4/9 to 9/9 add tests to verify default MTU assignment for
vti4 and vti6 interfaces, to check that MTU values set on new
link and link changes are properly taken and validated, and to
verify PMTU exceptions on vti4 interfaces.

Patch 1/9 fixes the helper to fetch exceptions MTU to run in the
passed namespace.

Patches 2/9 and 3/9 are preparation work to make it easier to
introduce those tests.

Stefano Brivio (9):
  selftests: pmtu: Use namespace command prefix to fetch route mtu
  selftests: pmtu: Factor out MTU parsing helper
  selftests: pmtu: Introduce support for multiple tests
  selftests: pmtu: Add pmtu_vti4_default_mtu test
  selftests: pmtu: Add pmtu_vti6_default_mtu test
  selftests: pmtu: Add pmtu_vti4_exception test
  selftests: pmtu: Add pmtu_vti4_link_add_mtu test
  selftests: pmtu: Add pmtu_vti6_link_add_mtu test
  selftests: pmtu: Add pmtu_vti6_link_change_mtu test

 tools/testing/selftests/net/pmtu.sh | 413 +++++++++++++++++++++++++++++++-----
 1 file changed, 357 insertions(+), 56 deletions(-)

-- 
2.15.1

^ permalink raw reply

* [PATCH net 3/3] vti6: Fix dev->max_mtu setting
From: Stefano Brivio @ 2018-03-15 16:17 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Alexey Kodanev,
	Jarod Wilson, netdev
In-Reply-To: <cover.1521128962.git.sbrivio@redhat.com>

We shouldn't allow a tunnel to have IP_MAX_MTU as MTU, because
another IPv6 header is going on top of our packets. Without this
patch, we might end up building packets bigger than IP_MAX_MTU.

Fixes: b96f9afee4eb ("ipv4/6: use core net MTU range checking")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/ipv6/ip6_vti.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 971175142e14..ce18cd20389d 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -879,7 +879,7 @@ static void vti6_dev_setup(struct net_device *dev)
 
 	dev->type = ARPHRD_TUNNEL6;
 	dev->min_mtu = IPV6_MIN_MTU;
-	dev->max_mtu = IP_MAX_MTU;
+	dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr);
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	netif_keep_dst(dev);
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 2/3] vti6: Keep set MTU on link creation or change, validate it
From: Stefano Brivio @ 2018-03-15 16:17 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Alexey Kodanev,
	Jarod Wilson, netdev
In-Reply-To: <cover.1521128962.git.sbrivio@redhat.com>

In vti6_link_config(), if MTU is already given on link creation
or change, validate and use it instead of recomputing it. To do
that, we need to propagate the knowledge that MTU was set by
userspace all the way down to vti6_link_config().

To keep this simple, vti6_dev_init() sets the new 'keep_mtu'
argument of vti6_link_config() to true: on initialization, we
don't have convenient access to netlink attributes there, but we
will anyway check whether dev->mtu is set in vti6_link_config().
If it's non-zero, it was set to the value of the IFLA_MTU
attribute during creation. Otherwise, determine a reasonable
value.

Fixes: ed1efb2aefbb ("ipv6: Add support for IPsec virtual tunnel interfaces")
Fixes: 53c81e95df17 ("ip6_vti: adjust vti mtu according to mtu of lower device")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/ipv6/ip6_vti.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 2ceef41cc097..971175142e14 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -622,7 +622,7 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	return 0;
 }
 
-static void vti6_link_config(struct ip6_tnl *t)
+static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu)
 {
 	struct net_device *dev = t->dev;
 	struct __ip6_tnl_parm *p = &t->parms;
@@ -641,6 +641,11 @@ static void vti6_link_config(struct ip6_tnl *t)
 	else
 		dev->flags &= ~IFF_POINTOPOINT;
 
+	if (keep_mtu && dev->mtu) {
+		dev->mtu = clamp(dev->mtu, dev->min_mtu, dev->max_mtu);
+		return;
+	}
+
 	if (p->flags & IP6_TNL_F_CAP_XMIT) {
 		int strict = (ipv6_addr_type(&p->raddr) &
 			      (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
@@ -668,12 +673,14 @@ static void vti6_link_config(struct ip6_tnl *t)
  * vti6_tnl_change - update the tunnel parameters
  *   @t: tunnel to be changed
  *   @p: tunnel configuration parameters
+ *   @keep_mtu: MTU was set from userspace, don't re-compute it
  *
  * Description:
  *   vti6_tnl_change() updates the tunnel parameters
  **/
 static int
-vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
+vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p,
+		bool keep_mtu)
 {
 	t->parms.laddr = p->laddr;
 	t->parms.raddr = p->raddr;
@@ -683,11 +690,12 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
 	t->parms.proto = p->proto;
 	t->parms.fwmark = p->fwmark;
 	dst_cache_reset(&t->dst_cache);
-	vti6_link_config(t);
+	vti6_link_config(t, keep_mtu);
 	return 0;
 }
 
-static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p,
+		       bool keep_mtu)
 {
 	struct net *net = dev_net(t->dev);
 	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
@@ -695,7 +703,7 @@ static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
 
 	vti6_tnl_unlink(ip6n, t);
 	synchronize_net();
-	err = vti6_tnl_change(t, p);
+	err = vti6_tnl_change(t, p, keep_mtu);
 	vti6_tnl_link(ip6n, t);
 	netdev_state_change(t->dev);
 	return err;
@@ -808,7 +816,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			} else
 				t = netdev_priv(dev);
 
-			err = vti6_update(t, &p1);
+			err = vti6_update(t, &p1, false);
 		}
 		if (t) {
 			err = 0;
@@ -907,7 +915,7 @@ static int vti6_dev_init(struct net_device *dev)
 
 	if (err)
 		return err;
-	vti6_link_config(t);
+	vti6_link_config(t, true);
 	return 0;
 }
 
@@ -1012,7 +1020,7 @@ static int vti6_changelink(struct net_device *dev, struct nlattr *tb[],
 	} else
 		t = netdev_priv(dev);
 
-	return vti6_update(t, &p);
+	return vti6_update(t, &p, tb && tb[IFLA_MTU]);
 }
 
 static size_t vti6_get_size(const struct net_device *dev)
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 1/3] vti6: Properly adjust vti6 MTU from MTU of lower device
From: Stefano Brivio @ 2018-03-15 16:17 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Alexey Kodanev,
	Jarod Wilson, netdev
In-Reply-To: <cover.1521128962.git.sbrivio@redhat.com>

If a lower device is found, we don't need to subtract
LL_MAX_HEADER to calculate our MTU: just use its MTU, the link
layer headers are already taken into account by it.

If the lower device is not found, start from ETH_DATA_LEN
instead, and only in this case subtract a worst-case
LL_MAX_HEADER.

We then need to subtract our additional IPv6 header from the
calculation.

While at it, note that vti6 doesn't have a hardware header, so
it doesn't need to set dev->hard_header_len. And as
vti6_link_config() now always sets the MTU, there's no need to
set a default value in vti6_dev_setup().

This makes the behaviour consistent with IPv4 vti, after
commit a32452366b72 ("vti4: Don't count header length twice."),
which was accidentally reverted by merge commit f895f0cfbb77
("Merge branch 'master' of
git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec").

While commit 53c81e95df17 ("ip6_vti: adjust vti mtu according to
mtu of lower device") improved on the original situation, this
was still not ideal. As reported in that commit message itself,
if we start from an underlying veth MTU of 9000, we end up with
an MTU of 8832, that is, 9000 - LL_MAX_HEADER - sizeof(ipv6hdr).
This should simply be 8880, or 9000 - sizeof(ipv6hdr) instead:
we found the lower device (veth) and we know we don't have any
additional link layer header, so there's no need to subtract an
hypothetical worst-case number.

Fixes: 53c81e95df17 ("ip6_vti: adjust vti mtu according to mtu of lower device")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/ipv6/ip6_vti.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index fa3ae1cb50d3..2ceef41cc097 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -627,6 +627,7 @@ static void vti6_link_config(struct ip6_tnl *t)
 	struct net_device *dev = t->dev;
 	struct __ip6_tnl_parm *p = &t->parms;
 	struct net_device *tdev = NULL;
+	int mtu;

 	memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
 	memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
@@ -656,8 +657,11 @@ static void vti6_link_config(struct ip6_tnl *t)
 		tdev = __dev_get_by_index(t->net, p->link);

 	if (tdev)
-		dev->mtu = max_t(int, tdev->mtu - dev->hard_header_len,
-				 IPV6_MIN_MTU);
+		mtu = tdev->mtu - sizeof(struct ipv6hdr);
+	else
+		mtu = ETH_DATA_LEN - LL_MAX_HEADER - sizeof(struct ipv6hdr);
+
+	dev->mtu = max_t(int, mtu, IPV6_MIN_MTU);
 }

 /**
@@ -866,8 +870,6 @@ static void vti6_dev_setup(struct net_device *dev)
 	dev->priv_destructor = vti6_dev_free;

 	dev->type = ARPHRD_TUNNEL6;
-	dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
-	dev->mtu = ETH_DATA_LEN;
 	dev->min_mtu = IPV6_MIN_MTU;
 	dev->max_mtu = IP_MAX_MTU;
 	dev->flags |= IFF_NOARP;
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 0/3] vti6: Fixes for MTU assignment and validation
From: Stefano Brivio @ 2018-03-15 16:17 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Alexey Kodanev,
	Jarod Wilson, netdev

This series introduces fixes to ensure that default MTU
assignment for vti6 is properly calculated (and to make it
consistent with vti4), to ensure we always use the MTU from
userspace (and validate it), if given, when a link is created or
changed, and that the MTU upper bound is properly set.

Stefano Brivio (3):
  vti6: Properly adjust vti6 MTU from MTU of lower device
  vti6: Keep set MTU on link creation or change, validate it
  vti6: Fix dev->max_mtu setting

 net/ipv6/ip6_vti.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

-- 
2.15.1

^ permalink raw reply

* [PATCH net 3/3] vti4: Don't override MTU passed on link creation via IFLA_MTU
From: Stefano Brivio @ 2018-03-15 16:16 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Pravin Shelar,
	netdev
In-Reply-To: <cover.1521068627.git.sbrivio@redhat.com>

Don't hardcode a MTU value on vti tunnel initialization,
ip_tunnel_newlink() is able to deal with this already. See also
commit ffc2b6ee4174 ("ip_gre: fix IFLA_MTU ignored on NEWLINK").

Fixes: 1181412c1a67 ("net/ipv4: VTI support new module for ip_vti.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/ipv4/ip_vti.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 502e5222eaa9..3f091ccad9af 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -387,7 +387,6 @@ static int vti_tunnel_init(struct net_device *dev)
 	memcpy(dev->dev_addr, &iph->saddr, 4);
 	memcpy(dev->broadcast, &iph->daddr, 4);
 
-	dev->mtu		= ETH_DATA_LEN;
 	dev->flags		= IFF_NOARP;
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_LLTX;
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 2/3] ip_tunnel: Clamp MTU to bounds on new link
From: Stefano Brivio @ 2018-03-15 16:16 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Pravin Shelar,
	netdev
In-Reply-To: <cover.1521068627.git.sbrivio@redhat.com>

Otherwise, it's possible to specify invalid MTU values directly
on creation of a link (via 'ip link add'). This is already
prevented on subsequent MTU changes by commit b96f9afee4eb
("ipv4/6: use core net MTU range checking").

Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/ipv4/ip_tunnel.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6d21068f9b55..7c76dd17b6b9 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1108,8 +1108,14 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		eth_hw_addr_random(dev);
 
 	mtu = ip_tunnel_bind_dev(dev);
-	if (!tb[IFLA_MTU])
+	if (tb[IFLA_MTU]) {
+		unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen;
+
+		dev->mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
+				 (unsigned int)(max - sizeof(struct iphdr)));
+	} else {
 		dev->mtu = mtu;
+	}
 
 	ip_tunnel_add(itn, nt);
 out:
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 1/3] vti4: Don't count header length twice on tunnel setup
From: Stefano Brivio @ 2018-03-15 16:16 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Pravin Shelar,
	netdev
In-Reply-To: <cover.1521068627.git.sbrivio@redhat.com>

This re-introduces the effect of commit a32452366b72 ("vti4:
Don't count header length twice.") which was accidentally
reverted by merge commit f895f0cfbb77 ("Merge branch 'master' of
git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec").

The commit message from Steffen Klassert said:

    We currently count the size of LL_MAX_HEADER and struct iphdr
    twice for vti4 devices, this leads to a wrong device mtu.
    The size of LL_MAX_HEADER and struct iphdr is already counted in
    ip_tunnel_bind_dev(), so don't do it again in vti_tunnel_init().

And this is still the case now: ip_tunnel_bind_dev() already
accounts for the header length of the link layer (not
necessarily LL_MAX_HEADER, if the output device is found), plus
one IP header.

For example, with a vti device on top of veth, with MTU of 1500,
the existing implementation would set the initial vti MTU to
1332, accounting once for LL_MAX_HEADER (128, included in
hard_header_len by vti) and twice for the same IP header (once
from hard_header_len, once from ip_tunnel_bind_dev()).

It should instead be 1480, because ip_tunnel_bind_dev() is able
to figure out that the output device is veth, so no additional
link layer header is attached, and will properly count one
single IP header.

The existing issue had the side effect of avoiding PMTUD for
most xfrm policies, by arbitrarily lowering the initial MTU.
However, the only way to get a consistent PMTU value is to let
the xfrm PMTU discovery do its course, and commit d6af1a31cc72
("vti: Add pmtu handling to vti_xmit.") now takes care of local
delivery cases where the application ignores local socket
notifications.

Fixes: b9959fd3b0fa ("vti: switch to new ip tunnel code")
Fixes: f895f0cfbb77 ("Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
---
 net/ipv4/ip_vti.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 51b1669334fe..502e5222eaa9 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -387,7 +387,6 @@ static int vti_tunnel_init(struct net_device *dev)
 	memcpy(dev->dev_addr, &iph->saddr, 4);
 	memcpy(dev->broadcast, &iph->daddr, 4);

-	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
 	dev->mtu		= ETH_DATA_LEN;
 	dev->flags		= IFF_NOARP;
 	dev->addr_len		= 4;
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 0/3] vti4, ip_tunnel: Fixes for MTU assignment and validation
From: Stefano Brivio @ 2018-03-15 16:16 UTC (permalink / raw)
  To: David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, Herbert Xu, Pravin Shelar,
	netdev

Patch 1/3 re-introduces a fix to ensure that default MTU on new
link is not lowered unnecessarily because of double counting of
headers. This fix was originally introduced in 2014 and got lost
in a merge commit shortly afterwards.

Patches 2/3 and 3/3 ensure that MTU passed from userspace on link
creation is taken into account and also properly validated.

Stefano Brivio (3):
  vti4: Don't count header length twice on tunnel setup
  ip_tunnel: Clamp MTU to bounds on new link
  vti4: Don't override MTU passed on link creation via IFLA_MTU

 net/ipv4/ip_tunnel.c | 8 +++++++-
 net/ipv4/ip_vti.c    | 2 --
 2 files changed, 7 insertions(+), 3 deletions(-)

-- 
2.15.1

^ permalink raw reply

* Re: [RFC,POC] iptables/nftables to epbf/xdp via common intermediate layer
From: Alexei Starovoitov @ 2018-03-15 16:13 UTC (permalink / raw)
  To: Edward Cree
  Cc: Florian Westphal, Daniel Borkmann, netdev, ast, pablo,
	David S. Miller
In-Reply-To: <f6b7ead2-8c4e-8d87-abad-b03f23d6de84@solarflare.com>

On Tue, Mar 06, 2018 at 06:18:04PM +0000, Edward Cree wrote:
> On 06/03/18 18:03, Florian Westphal wrote:
> > I don't know.  I suspect we should go for naive algorithm only,
> > but I would defer such decision to Alexei/Daniel.
> >
> > f.e. i don't know if using llvm is a good idea or not,
> Yeah, I wondered about that too.ï¿½ I think it's probably not a good idea,
> ï¿½since it's a big project with a complex interface and the tail would
> ï¿½likely wag the dog.
> 
> > I did not
> > intend to turn proposed imr into full blown compiler in any case,
> > I only want to avoid code duplication for iptables/nftables -> ebpf
> > translator.
> Well, I think anything that calling code does by hand will entail limits
> ï¿½on what the imr->ebpf step can do.ï¿½ E.g. if imr uses registers by name
> ï¿½then the backend can't expand any highlevel constructs that need to use
> ï¿½scratch registers, unless some regs are reserved for the backend and
> ï¿½can't be used by the imr.
> 
> Ultimately I think the imr will have to become practically a compiler
> ï¿½back-end, because any jobs it punts to the caller will necessarily be
> ï¿½duplicated between iptables and nftables.

few years back we had a prototype of nft->bpf converter.
It was reading nft netlink, generating really dumb C code,
and feeding into clang/llvm. Performance of generated code was
surprisingly pretty good. llvm did particularly well combining
the same loads from multiple rules into single load and
combining all 'if' conditions into a few.
So full blown compiler approach known to work, but ...
we cannot use it here.
We cannot have bpfilter umh module depend on llvm.
The converter have to be fully contained within kernel source tree.
It's probably fine to start simple with compiler-like thingy and
later improve it with sophisticated optimizations and fancy
register allocator, but I suspect we can achieve better performance
and easier to maintain code if we specialize the converters.

The way this IMR defined today looks pretty much like nft and
it feels a bit too low level than iptable conversion would need.
Converting iptable to this IMR is like going old days of gcc
when C/C++ front-ends were generating RTL and all optimizations
were done on RTL representation. Eventually gcc had to adopt
gimple (variant of ssa) format to do optimizations that were
very hard to do with RTL.
I suspect if we go iptable->nft (same as this imr) the history
will repeat itself. It will work, but will be difficult to apply
interesting optimizations.

Similarly I don't quite see why this particular IMR is needed
for nft->bpf either.
I think it would be simpler to have user space only extensions
and opcodes added to bpf for the purpose of the translation.
Like there is no bpf instruction called 'load from IP header',
but we can make one. Just extend extended bpf with an instruction
like this and on the first pass do full conversion of nft
directly into this 'extended extended bpf'.
Then do second pass of lowering 'extended extended bpf' into eBPF
that kernel actually understands.
That would be similar to what llvm does when it lowers higher level
represenation into lower one.
The higher level IR doesn't need to be completely different
from lower level IR. Often lower level IR is a subset of higher level.

For iptable->bpf I think it's easier to construct decision tree
out of all iptable rules, optimize it as a tree and then
convert it into a set of map lookups glued with a bit of bpf code.
Converting iptable rules one by one isn't going to scale.
It was fine for prototype, but now we're talking production.
If there are 1k iptables rules and we generate just 4 bpf insns
for every rule, we will already hit 4k insn limit.
We can easily bump the limit, but even if it's 128k insns
it's still the limit. All iptables rules should be converted together.

^ permalink raw reply

* Re: [PATCH] net: dev_forward_skb(): Scrub packet's per-netns info only when crossing netns
From: Shmulik Ladkani @ 2018-03-15 16:11 UTC (permalink / raw)
  To: Liran Alon
  Cc: mrv, netdev, daniel, davem, linux-kernel, yuval.shaia, idan.brown
In-Reply-To: <58e67195-56f6-4d01-8747-f8322a382358@default>

On Thu, 15 Mar 2018 08:01:03 -0700 (PDT) Liran Alon <liran.alon@oracle.com> wrote:
> 
> I still think that default behavior should be to zero skb->mark only when skb
> cross netdevs in different netns.

But the previous default was scrub the mark in *both* xnet and non-xnet
situations.

Therefore, there might be users which RELY on this (strange) default
behavior in their same-netns-veth-pair setups.
Meaning, changing the default behavior might break their apps relying on
the former default behavior.

This is why the "disable mark scrubbing in non-xnet case" should be opt-in.

Regards,
Shmulik

^ permalink raw reply

* Re: [net-next 1/5] tipc: obsolete TIPC_ZONE_SCOPE
From: Jiri Pirko @ 2018-03-15 16:11 UTC (permalink / raw)
  To: Jon Maloy
  Cc: davem, netdev, mohan.krishna.ghanta.krishnamurthy, tung.q.nguyen,
	hoang.h.le, canh.d.luu, ying.xue, tipc-discussion
In-Reply-To: <1521128935-6141-3-git-send-email-jon.maloy@ericsson.com>

Thu, Mar 15, 2018 at 04:48:51PM CET, jon.maloy@ericsson.com wrote:
>Publications for TIPC_CLUSTER_SCOPE and TIPC_ZONE_SCOPE are in all
>aspects handled the same way, both on the publishing node and on the
>receiving nodes.
>
>Despite previous ambitions to the contrary, this is never going to change,
>so we take the conseqeunce of this and obsolete TIPC_ZONE_SCOPE and related
>macros/functions. Whenever a user is doing a bind() or a sendmsg() attempt
>using ZONE_SCOPE we translate this internally to CLUSTER_SCOPE, while we
>remain compatible with users and remote nodes still using ZONE_SCOPE.
>
>Furthermore, the non-formalized scope value 0 has always been permitted
>for use during lookup, with the same meaning as ZONE_SCOPE/CLUSTER_SCOPE.
>We now permit it even as binding scope, but for compatibility reasons we
>choose to not change the value of TIPC_CLUSTER_SCOPE.
>
>Acked-by: Ying Xue <ying.xue@windriver.com>
>Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>

[...]


>diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
>index 14bacc7..4ac9f1f 100644
>--- a/include/uapi/linux/tipc.h
>+++ b/include/uapi/linux/tipc.h
>@@ -61,50 +61,6 @@ struct tipc_name_seq {
> 	__u32 upper;
> };
> 
>-/* TIPC Address Size, Offset, Mask specification for Z.C.N
>- */
>-#define TIPC_NODE_BITS          12
>-#define TIPC_CLUSTER_BITS       12
>-#define TIPC_ZONE_BITS          8
>-
>-#define TIPC_NODE_OFFSET        0
>-#define TIPC_CLUSTER_OFFSET     TIPC_NODE_BITS
>-#define TIPC_ZONE_OFFSET        (TIPC_CLUSTER_OFFSET + TIPC_CLUSTER_BITS)
>-
>-#define TIPC_NODE_SIZE          ((1UL << TIPC_NODE_BITS) - 1)
>-#define TIPC_CLUSTER_SIZE       ((1UL << TIPC_CLUSTER_BITS) - 1)
>-#define TIPC_ZONE_SIZE          ((1UL << TIPC_ZONE_BITS) - 1)
>-
>-#define TIPC_NODE_MASK		(TIPC_NODE_SIZE << TIPC_NODE_OFFSET)
>-#define TIPC_CLUSTER_MASK	(TIPC_CLUSTER_SIZE << TIPC_CLUSTER_OFFSET)
>-#define TIPC_ZONE_MASK		(TIPC_ZONE_SIZE << TIPC_ZONE_OFFSET)
>-
>-#define TIPC_ZONE_CLUSTER_MASK (TIPC_ZONE_MASK | TIPC_CLUSTER_MASK)
>-
>-static inline __u32 tipc_addr(unsigned int zone,
>-			      unsigned int cluster,
>-			      unsigned int node)
>-{
>-	return (zone << TIPC_ZONE_OFFSET) |
>-		(cluster << TIPC_CLUSTER_OFFSET) |
>-		node;
>-}
>-
>-static inline unsigned int tipc_zone(__u32 addr)
>-{
>-	return addr >> TIPC_ZONE_OFFSET;
>-}
>-
>-static inline unsigned int tipc_cluster(__u32 addr)
>-{
>-	return (addr & TIPC_CLUSTER_MASK) >> TIPC_CLUSTER_OFFSET;
>-}
>-
>-static inline unsigned int tipc_node(__u32 addr)
>-{
>-	return addr & TIPC_NODE_MASK;
>-}

If someone includes tipc.h and uses any of this, your patch is going to
break his compilation. Would anyone have good reason to use any of this?

^ permalink raw reply

* [PATCH linux-stable-4.14] tcp: reset sk_send_head in tcp_write_queue_purge
From: Soheil Hassas Yeganeh @ 2018-03-15 16:09 UTC (permalink / raw)
  To: davem, netdev
  Cc: ycheng, ncardwell, yongjianchn, nefelim4ag, Soheil Hassas Yeganeh,
	Eric Dumazet

From: Soheil Hassas Yeganeh <soheil@google.com>

tcp_write_queue_purge clears all the SKBs in the write queue
but does not reset the sk_send_head. As a result, we can have
a NULL pointer dereference anywhere that we use tcp_send_head
instead of the tcp_write_queue_tail.

For example, after 27fid7a8ed38 (tcp: purge write queue upon RST),
we can purge the write queue on RST. Prior to
75c119afe14f (tcp: implement rb-tree based retransmit queue),
tcp_push will only check tcp_send_head and then accesses
tcp_write_queue_tail to send the actual SKB. As a result, it will
dereference a NULL pointer.

This has been reported twice for 4.14 where we don't have
75c119afe14f:

By Timofey Titovets:

[  422.081094] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000038
[  422.081254] IP: tcp_push+0x42/0x110
[  422.081314] PGD 0 P4D 0
[  422.081364] Oops: 0002 [#1] SMP PTI

By Yongjian Xu:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: tcp_push+0x48/0x120
PGD 80000007ff77b067 P4D 80000007ff77b067 PUD 7fd989067 PMD 0
Oops: 0002 [#18] SMP PTI
Modules linked in: tcp_diag inet_diag tcp_bbr sch_fq iTCO_wdt
iTCO_vendor_support pcspkr ixgbe mdio i2c_i801 lpc_ich joydev input_leds shpchp
e1000e igb dca ptp pps_core hwmon mei_me mei ipmi_si ipmi_msghandler sg ses
scsi_transport_sas enclosure ext4 jbd2 mbcache sd_mod ahci libahci megaraid_sas
wmi ast ttm dm_mirror dm_region_hash dm_log dm_mod dax
CPU: 6 PID: 14156 Comm: [ET_NET 6] Tainted: G D 4.14.26-1.el6.x86_64 #1
Hardware name: LENOVO ThinkServer RD440 /ThinkServer RD440, BIOS A0TS80A
09/22/2014
task: ffff8807d78d8140 task.stack: ffffc9000e944000
RIP: 0010:tcp_push+0x48/0x120
RSP: 0018:ffffc9000e947a88 EFLAGS: 00010246
RAX: 00000000000005b4 RBX: ffff880f7cce9c00 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff8807d00f5000
RBP: ffffc9000e947aa8 R08: 0000000000001c84 R09: 0000000000000000
R10: ffff8807d00f5158 R11: 0000000000000000 R12: ffff8807d00f5000
R13: 0000000000000020 R14: 00000000000256d4 R15: 0000000000000000
FS: 00007f5916de9700(0000) GS:ffff88107fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000038 CR3: 00000007f8226004 CR4: 00000000001606e0
Call Trace:
tcp_sendmsg_locked+0x33d/0xe50
tcp_sendmsg+0x37/0x60
inet_sendmsg+0x39/0xc0
sock_sendmsg+0x49/0x60
sock_write_iter+0xb6/0x100
do_iter_readv_writev+0xec/0x130
? rw_verify_area+0x49/0xb0
do_iter_write+0x97/0xd0
vfs_writev+0x7e/0xe0
? __wake_up_common_lock+0x80/0xa0
? __fget_light+0x2c/0x70
? __do_page_fault+0x1e7/0x530
do_writev+0x60/0xf0
? inet_shutdown+0xac/0x110
SyS_writev+0x10/0x20
do_syscall_64+0x6f/0x140
? prepare_exit_to_usermode+0x8b/0xa0
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x3135ce0c57
RSP: 002b:00007f5916de4b00 EFLAGS: 00000293 ORIG_RAX: 0000000000000014
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000003135ce0c57
RDX: 0000000000000002 RSI: 00007f5916de4b90 RDI: 000000000000606f
RBP: 0000000000000000 R08: 0000000000000000 R09: 00007f5916de8c38
R10: 0000000000000000 R11: 0000000000000293 R12: 00000000000464cc
R13: 00007f5916de8c30 R14: 00007f58d8bef080 R15: 0000000000000002
Code: 48 8b 97 60 01 00 00 4c 8d 97 58 01 00 00 41 b9 00 00 00 00 41 89 f3 4c 39
d2 49 0f 44 d1 41 81 e3 00 80 00 00 0f 85 b0 00 00 00 <80> 4a 38 08 44 8b 8f 74
06 00 00 44 89 8f 7c 06 00 00 83 e6 01
RIP: tcp_push+0x48/0x120 RSP: ffffc9000e947a88
CR2: 0000000000000038
---[ end trace 8d545c2e93515549 ]---

Fixes: a27fid7a8ed38 (tcp: purge write queue upon RST)
Reported-by: Timofey Titovets <nefelim4ag@gmail.com>
Reported-by: Yongjian Xu <yongjianchn@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
 include/net/tcp.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0a13574134b8b..d323d4fa742ca 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1600,6 +1600,11 @@ enum tcp_chrono {
 void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
 void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
 
+static inline void tcp_init_send_head(struct sock *sk)
+{
+	sk->sk_send_head = NULL;
+}
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
@@ -1610,6 +1615,7 @@ static inline void tcp_write_queue_purge(struct sock *sk)
 		sk_wmem_free_skb(sk, skb);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
+	tcp_init_send_head(sk);
 }
 
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
@@ -1672,11 +1678,6 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli
 		tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-	sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_queue_tail(&sk->sk_write_queue, skb);
-- 
2.16.2.804.g6dcf76e118-goog

^ permalink raw reply related

* Re: [PATCH] Bluetooth: btrsi: rework dependencies
From: Marcel Holtmann @ 2018-03-15 16:03 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Johan Hedberg, Kalle Valo, Linux Bluetooth mailing list, LKML,
	linux-wireless, Networking
In-Reply-To: <CAK8P3a3y00nUVuYopxXWaJjo_OxXhVsATyiSOfXz5b09JFq2dA@mail.gmail.com>

Hi Arnd,

>>> diff --git a/drivers/bluetooth/Makefile b/drivers/bluetooth/Makefile
>>> index 03cfc1b20c4a..9e8d22712ff3 100644
>>> --- a/drivers/bluetooth/Makefile
>>> +++ b/drivers/bluetooth/Makefile
>>> @@ -28,7 +28,7 @@ obj-$(CONFIG_BT_QCA)                += btqca.o
>>> 
>>> obj-$(CONFIG_BT_HCIUART_NOKIA)        += hci_nokia.o
>>> 
>>> -obj-$(CONFIG_BT_HCIRSI)              += btrsi.o
>>> +obj-$(CONFIG_BT_HCIRSI_MODULE)       += btrsi.o
>> 
>> do we need this new option? I have avoided these kind of complex things multi config entries. Can we not just select the RSI_91X?
>> 
> 
> I couldn't come up with a simpler way to do this.
> Selecting RSI_91X is not possible unless we make the BT
> driver 'depend on WLAN_VENDOR_RSI && MAC80211',
> which is even more backwards.
> 
> The problem here is that it's actually a reverse dependency:
> the wlan driver calls into the bt driver.
> 
> What we could do is to make BT_HCIRSI a silent symbol
> and have that selected by RSI_COEX, which can then
> be user-visible. With that, the Kconfig structure would follow
> what the code does.

that sounds a bit better to me. If the RSI driver maintainers don’t like that, then they should re-architect the code to make this more dynamic and flexible.

Regards

Marcel

^ permalink raw reply

* [PATCH net-next v3 7/7] ibmvnic: Update TX pool cleaning routine
From: Thomas Falcon @ 2018-03-15 16:02 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, davem, Thomas Falcon
In-Reply-To: <1521129763-21030-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Update routine that cleans up any outstanding transmits that
have not received completions when the device needs to close.
Introduces a helper function that cleans one TX pool to make
code more readable.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 40 +++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index af6f819..0e74546 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1128,34 +1128,42 @@ static void clean_rx_pools(struct ibmvnic_adapter *adapter)
 	}
 }
 
-static void clean_tx_pools(struct ibmvnic_adapter *adapter)
+static void clean_one_tx_pool(struct ibmvnic_adapter *adapter,
+			      struct ibmvnic_tx_pool *tx_pool)
 {
-	struct ibmvnic_tx_pool *tx_pool;
 	struct ibmvnic_tx_buff *tx_buff;
 	u64 tx_entries;
+	int i;
+
+	if (!tx_pool && !tx_pool->tx_buff)
+		return;
+
+	tx_entries = adapter->req_tx_entries_per_subcrq;
+
+	for (i = 0; i < tx_entries; i++) {
+		tx_buff = &tx_pool->tx_buff[i];
+		if (tx_buff && tx_buff->skb) {
+			dev_kfree_skb_any(tx_buff->skb);
+			tx_buff->skb = NULL;
+		}
+	}
+}
+
+static void clean_tx_pools(struct ibmvnic_adapter *adapter)
+{
 	int tx_scrqs;
-	int i, j;
+	int i;
 
-	if (!adapter->tx_pool)
+	if (!adapter->tx_pool || !adapter->tso_pool)
 		return;
 
 	tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
-	tx_entries = adapter->req_tx_entries_per_subcrq;
 
 	/* Free any remaining skbs in the tx buffer pools */
 	for (i = 0; i < tx_scrqs; i++) {
-		tx_pool = &adapter->tx_pool[i];
-		if (!tx_pool && !tx_pool->tx_buff)
-			continue;
-
 		netdev_dbg(adapter->netdev, "Cleaning tx_pool[%d]\n", i);
-		for (j = 0; j < tx_entries; j++) {
-			tx_buff = &tx_pool->tx_buff[j];
-			if (tx_buff && tx_buff->skb) {
-				dev_kfree_skb_any(tx_buff->skb);
-				tx_buff->skb = NULL;
-			}
-		}
+		clean_one_tx_pool(adapter, &adapter->tx_pool[i]);
+		clean_one_tx_pool(adapter, &adapter->tso_pool[i]);
 	}
 }
 
-- 
2.7.5

^ permalink raw reply related

* [PATCH net-next v3 3/7] ibmvnic: Update release TX pool routine
From: Thomas Falcon @ 2018-03-15 16:02 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, davem, Thomas Falcon
In-Reply-To: <1521129763-21030-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Introduce function that frees one TX pool.  Use that to release
each pool in both the standard TX pool and TSO pool arrays.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4dc3044..258d54e 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -608,25 +608,30 @@ static void release_vpd_data(struct ibmvnic_adapter *adapter)
 	adapter->vpd = NULL;
 }
 
+static void release_one_tx_pool(struct ibmvnic_adapter *adapter,
+				struct ibmvnic_tx_pool *tx_pool)
+{
+	kfree(tx_pool->tx_buff);
+	kfree(tx_pool->free_map);
+	free_long_term_buff(adapter, &tx_pool->long_term_buff);
+}
+
 static void release_tx_pools(struct ibmvnic_adapter *adapter)
 {
-	struct ibmvnic_tx_pool *tx_pool;
 	int i;
 
 	if (!adapter->tx_pool)
 		return;
 
 	for (i = 0; i < adapter->num_active_tx_pools; i++) {
-		netdev_dbg(adapter->netdev, "Releasing tx_pool[%d]\n", i);
-		tx_pool = &adapter->tx_pool[i];
-		kfree(tx_pool->tx_buff);
-		free_long_term_buff(adapter, &tx_pool->long_term_buff);
-		free_long_term_buff(adapter, &tx_pool->tso_ltb);
-		kfree(tx_pool->free_map);
+		release_one_tx_pool(adapter, &adapter->tx_pool[i]);
+		release_one_tx_pool(adapter, &adapter->tso_pool[i]);
 	}
 
 	kfree(adapter->tx_pool);
 	adapter->tx_pool = NULL;
+	kfree(adapter->tso_pool);
+	adapter->tso_pool = NULL;
 	adapter->num_active_tx_pools = 0;
 }
 
-- 
2.7.5

^ permalink raw reply related

* [PATCH net-next v3 6/7] ibmvnic: Improve TX buffer accounting
From: Thomas Falcon @ 2018-03-15 16:02 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, davem, Thomas Falcon
In-Reply-To: <1521129763-21030-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Improve TX pool buffer accounting to prevent the producer
index from overruning the consumer. First, set the next free
index to an invalid value if it is in use. If next buffer
to be consumed is in use, drop the packet.

Finally, if the transmit fails for some other reason, roll
back the consumer index and set the free map entry to its original
value. This should also be done if the DMA map fails.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 672e922..af6f819 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1426,6 +1426,16 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 
 	index = tx_pool->free_map[tx_pool->consumer_index];
 
+	if (index == IBMVNIC_INVALID_MAP) {
+		dev_kfree_skb_any(skb);
+		tx_send_failed++;
+		tx_dropped++;
+		ret = NETDEV_TX_OK;
+		goto out;
+	}
+
+	tx_pool->free_map[tx_pool->consumer_index] = IBMVNIC_INVALID_MAP;
+
 	offset = index * tx_pool->buf_size;
 	dst = tx_pool->long_term_buff.buff + offset;
 	memset(dst, 0, tx_pool->buf_size);
@@ -1522,7 +1532,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 			tx_map_failed++;
 			tx_dropped++;
 			ret = NETDEV_TX_OK;
-			goto out;
+			goto tx_err_out;
 		}
 		lpar_rc = send_subcrq_indirect(adapter, handle_array[queue_num],
 					       (u64)tx_buff->indir_dma,
@@ -1534,13 +1544,6 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	}
 	if (lpar_rc != H_SUCCESS) {
 		dev_err(dev, "tx failed with code %ld\n", lpar_rc);
-
-		if (tx_pool->consumer_index == 0)
-			tx_pool->consumer_index =
-				tx_pool->num_buffers - 1;
-		else
-			tx_pool->consumer_index--;
-
 		dev_kfree_skb_any(skb);
 		tx_buff->skb = NULL;
 
@@ -1556,7 +1559,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		tx_send_failed++;
 		tx_dropped++;
 		ret = NETDEV_TX_OK;
-		goto out;
+		goto tx_err_out;
 	}
 
 	if (atomic_add_return(num_entries, &tx_scrq->used)
@@ -1569,7 +1572,16 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	tx_bytes += skb->len;
 	txq->trans_start = jiffies;
 	ret = NETDEV_TX_OK;
+	goto out;
 
+tx_err_out:
+	/* roll back consumer index and map array*/
+	if (tx_pool->consumer_index == 0)
+		tx_pool->consumer_index =
+			tx_pool->num_buffers - 1;
+	else
+		tx_pool->consumer_index--;
+	tx_pool->free_map[tx_pool->consumer_index] = index;
 out:
 	netdev->stats.tx_dropped += tx_dropped;
 	netdev->stats.tx_bytes += tx_bytes;
-- 
2.7.5

^ permalink raw reply related

* [PATCH net-next v3 5/7] ibmvnic: Update TX and TX completion routines
From: Thomas Falcon @ 2018-03-15 16:02 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, davem, Thomas Falcon
In-Reply-To: <1521129763-21030-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Update TX and TX completion routines to account for TX pool
restructuring. TX routine first chooses the pool depending
on whether a packet is GSO or not, then uses it accordingly.

For the completion routine to know which pool it needs to use,
set the most significant bit of the correlator index to one
if the packet uses the TSO pool. On completion, unset the bit
and use the correlator index to release the buffer pool entry.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 55 +++++++++++++++++++-------------------
 drivers/net/ethernet/ibm/ibmvnic.h |  1 +
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 2bb5d56..672e922 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1414,8 +1414,11 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		ret = NETDEV_TX_OK;
 		goto out;
 	}
+	if (skb_is_gso(skb))
+		tx_pool = &adapter->tso_pool[queue_num];
+	else
+		tx_pool = &adapter->tx_pool[queue_num];
 
-	tx_pool = &adapter->tx_pool[queue_num];
 	tx_scrq = adapter->tx_scrq[queue_num];
 	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
 	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
@@ -1423,20 +1426,10 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 
 	index = tx_pool->free_map[tx_pool->consumer_index];
 
-	if (skb_is_gso(skb)) {
-		offset = tx_pool->tso_index * IBMVNIC_TSO_BUF_SZ;
-		dst = tx_pool->tso_ltb.buff + offset;
-		memset(dst, 0, IBMVNIC_TSO_BUF_SZ);
-		data_dma_addr = tx_pool->tso_ltb.addr + offset;
-		tx_pool->tso_index++;
-		if (tx_pool->tso_index == IBMVNIC_TSO_BUFS)
-			tx_pool->tso_index = 0;
-	} else {
-		offset = index * (adapter->req_mtu + VLAN_HLEN);
-		dst = tx_pool->long_term_buff.buff + offset;
-		memset(dst, 0, adapter->req_mtu + VLAN_HLEN);
-		data_dma_addr = tx_pool->long_term_buff.addr + offset;
-	}
+	offset = index * tx_pool->buf_size;
+	dst = tx_pool->long_term_buff.buff + offset;
+	memset(dst, 0, tx_pool->buf_size);
+	data_dma_addr = tx_pool->long_term_buff.addr + offset;
 
 	if (skb_shinfo(skb)->nr_frags) {
 		int cur, i;
@@ -1459,8 +1452,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	}
 
 	tx_pool->consumer_index =
-	    (tx_pool->consumer_index + 1) %
-		adapter->req_tx_entries_per_subcrq;
+	    (tx_pool->consumer_index + 1) % tx_pool->num_buffers;
 
 	tx_buff = &tx_pool->tx_buff[index];
 	tx_buff->skb = skb;
@@ -1476,11 +1468,13 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	tx_crq.v1.n_crq_elem = 1;
 	tx_crq.v1.n_sge = 1;
 	tx_crq.v1.flags1 = IBMVNIC_TX_COMP_NEEDED;
-	tx_crq.v1.correlator = cpu_to_be32(index);
+
 	if (skb_is_gso(skb))
-		tx_crq.v1.dma_reg = cpu_to_be16(tx_pool->tso_ltb.map_id);
+		tx_crq.v1.correlator =
+			cpu_to_be32(index | IBMVNIC_TSO_POOL_MASK);
 	else
-		tx_crq.v1.dma_reg = cpu_to_be16(tx_pool->long_term_buff.map_id);
+		tx_crq.v1.correlator = cpu_to_be32(index);
+	tx_crq.v1.dma_reg = cpu_to_be16(tx_pool->long_term_buff.map_id);
 	tx_crq.v1.sge_len = cpu_to_be32(skb->len);
 	tx_crq.v1.ioba = cpu_to_be64(data_dma_addr);
 
@@ -1543,7 +1537,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 
 		if (tx_pool->consumer_index == 0)
 			tx_pool->consumer_index =
-				adapter->req_tx_entries_per_subcrq - 1;
+				tx_pool->num_buffers - 1;
 		else
 			tx_pool->consumer_index--;
 
@@ -2547,6 +2541,7 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 			       struct ibmvnic_sub_crq_queue *scrq)
 {
 	struct device *dev = &adapter->vdev->dev;
+	struct ibmvnic_tx_pool *tx_pool;
 	struct ibmvnic_tx_buff *txbuff;
 	union sub_crq *next;
 	int index;
@@ -2566,7 +2561,14 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 				continue;
 			}
 			index = be32_to_cpu(next->tx_comp.correlators[i]);
-			txbuff = &adapter->tx_pool[pool].tx_buff[index];
+			if (index & IBMVNIC_TSO_POOL_MASK) {
+				tx_pool = &adapter->tso_pool[pool];
+				index &= ~IBMVNIC_TSO_POOL_MASK;
+			} else {
+				tx_pool = &adapter->tx_pool[pool];
+			}
+
+			txbuff = &tx_pool->tx_buff[index];
 
 			for (j = 0; j < IBMVNIC_MAX_FRAGS_PER_CRQ; j++) {
 				if (!txbuff->data_dma[j])
@@ -2589,11 +2591,10 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 
 			num_entries += txbuff->num_entries;
 
-			adapter->tx_pool[pool].free_map[adapter->tx_pool[pool].
-						     producer_index] = index;
-			adapter->tx_pool[pool].producer_index =
-			    (adapter->tx_pool[pool].producer_index + 1) %
-			    adapter->req_tx_entries_per_subcrq;
+			tx_pool->free_map[tx_pool->producer_index] = index;
+			tx_pool->producer_index =
+				(tx_pool->producer_index + 1) %
+					tx_pool->num_buffers;
 		}
 		/* remove tx_comp scrq*/
 		next->tx_comp.first = 0;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index a2e21b3..89efe70 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -43,6 +43,7 @@
 
 #define IBMVNIC_TSO_BUF_SZ	65536
 #define IBMVNIC_TSO_BUFS	64
+#define IBMVNIC_TSO_POOL_MASK	0x80000000
 
 #define IBMVNIC_MAX_LTB_SIZE ((1 << (MAX_ORDER - 1)) * PAGE_SIZE)
 #define IBMVNIC_BUFFER_HLEN 500
-- 
2.7.5

^ permalink raw reply related

* [PATCH net-next v3 4/7] ibmvnic: Update TX pool initialization routine
From: Thomas Falcon @ 2018-03-15 16:02 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, davem, Thomas Falcon
In-Reply-To: <1521129763-21030-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Introduce function that initializes one TX pool. Use that to
create each pool entry in both the standard TX pool and TSO
pool arrays.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 90 ++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 258d54e..2bb5d56 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -635,13 +635,43 @@ static void release_tx_pools(struct ibmvnic_adapter *adapter)
 	adapter->num_active_tx_pools = 0;
 }
 
+static int init_one_tx_pool(struct net_device *netdev,
+			    struct ibmvnic_tx_pool *tx_pool,
+			    int num_entries, int buf_size)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int i;
+
+	tx_pool->tx_buff = kcalloc(num_entries,
+				   sizeof(struct ibmvnic_tx_buff),
+				   GFP_KERNEL);
+	if (!tx_pool->tx_buff)
+		return -1;
+
+	if (alloc_long_term_buff(adapter, &tx_pool->long_term_buff,
+				 num_entries * buf_size))
+		return -1;
+
+	tx_pool->free_map = kcalloc(num_entries, sizeof(int), GFP_KERNEL);
+	if (!tx_pool->free_map)
+		return -1;
+
+	for (i = 0; i < num_entries; i++)
+		tx_pool->free_map[i] = i;
+
+	tx_pool->consumer_index = 0;
+	tx_pool->producer_index = 0;
+	tx_pool->num_buffers = num_entries;
+	tx_pool->buf_size = buf_size;
+
+	return 0;
+}
+
 static int init_tx_pools(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
-	struct device *dev = &adapter->vdev->dev;
-	struct ibmvnic_tx_pool *tx_pool;
 	int tx_subcrqs;
-	int i, j;
+	int i, rc;
 
 	tx_subcrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
 	adapter->tx_pool = kcalloc(tx_subcrqs,
@@ -649,53 +679,29 @@ static int init_tx_pools(struct net_device *netdev)
 	if (!adapter->tx_pool)
 		return -1;
 
+	adapter->tso_pool = kcalloc(tx_subcrqs,
+				    sizeof(struct ibmvnic_tx_pool), GFP_KERNEL);
+	if (!adapter->tso_pool)
+		return -1;
+
 	adapter->num_active_tx_pools = tx_subcrqs;
 
 	for (i = 0; i < tx_subcrqs; i++) {
-		tx_pool = &adapter->tx_pool[i];
-
-		netdev_dbg(adapter->netdev,
-			   "Initializing tx_pool[%d], %lld buffs\n",
-			   i, adapter->req_tx_entries_per_subcrq);
-
-		tx_pool->tx_buff = kcalloc(adapter->req_tx_entries_per_subcrq,
-					   sizeof(struct ibmvnic_tx_buff),
-					   GFP_KERNEL);
-		if (!tx_pool->tx_buff) {
-			dev_err(dev, "tx pool buffer allocation failed\n");
-			release_tx_pools(adapter);
-			return -1;
-		}
-
-		if (alloc_long_term_buff(adapter, &tx_pool->long_term_buff,
-					 adapter->req_tx_entries_per_subcrq *
-					 (adapter->req_mtu + VLAN_HLEN))) {
-			release_tx_pools(adapter);
-			return -1;
-		}
-
-		/* alloc TSO ltb */
-		if (alloc_long_term_buff(adapter, &tx_pool->tso_ltb,
-					 IBMVNIC_TSO_BUFS *
-					 IBMVNIC_TSO_BUF_SZ)) {
+		rc = init_one_tx_pool(netdev, &adapter->tx_pool[i],
+				      adapter->req_tx_entries_per_subcrq,
+				      adapter->req_mtu + VLAN_HLEN);
+		if (rc) {
 			release_tx_pools(adapter);
-			return -1;
+			return rc;
 		}
 
-		tx_pool->tso_index = 0;
-
-		tx_pool->free_map = kcalloc(adapter->req_tx_entries_per_subcrq,
-					    sizeof(int), GFP_KERNEL);
-		if (!tx_pool->free_map) {
+		init_one_tx_pool(netdev, &adapter->tso_pool[i],
+				 IBMVNIC_TSO_BUFS,
+				 IBMVNIC_TSO_BUF_SZ);
+		if (rc) {
 			release_tx_pools(adapter);
-			return -1;
+			return rc;
 		}
-
-		for (j = 0; j < adapter->req_tx_entries_per_subcrq; j++)
-			tx_pool->free_map[j] = j;
-
-		tx_pool->consumer_index = 0;
-		tx_pool->producer_index = 0;
 	}
 
 	return 0;
-- 
2.7.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox