netdev.vger.kernel.org archive mirror
* [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback
@ 2025-12-20  3:23 Vadim Fedorenko
  2025-12-20  3:23 ` [PATCH net v2 2/2] selftests: fib_test: Add test case for ipv4 multi nexthops Vadim Fedorenko
  2025-12-21 15:55 ` [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback Willem de Bruijn
  0 siblings, 2 replies; 6+ messages in thread
From: Vadim Fedorenko @ 2025-12-20  3:23 UTC (permalink / raw)
  To: David S. Miller, David Ahern, Eric Dumazet, Paolo Abeni,
	Simon Horman, Willem de Bruijn, Jakub Kicinski
  Cc: Shuah Khan, Ido Schimmel, netdev, Vadim Fedorenko

Preferring the multipath nexthop that matches the source address broke
ECMP for packets whose source addresses are not in the broadcast domain
but are instead assigned to loopback/dummy interfaces. The original
behaviour was to balance across the nexthops; now the last nexthop in
the group is always used.

For the case with 198.51.100.1/32 assigned to dummy0 and routed using
192.0.2.0/24 and 203.0.113.0/24 networks:

2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
    link/ether d6:54:8a:ff:78:f5 brd ff:ff:ff:ff:ff:ff
    inet 198.51.100.1/32 scope global dummy0
       valid_lft forever preferred_lft forever
7: veth1@if6: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 06:ed:98:87:6d:8a brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet 192.0.2.2/24 scope global veth1
       valid_lft forever preferred_lft forever
    inet6 fe80::4ed:98ff:fe87:6d8a/64 scope link proto kernel_ll
       valid_lft forever preferred_lft forever
9: veth3@if8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether ae:75:23:38:a0:d2 brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet 203.0.113.2/24 scope global veth3
       valid_lft forever preferred_lft forever
    inet6 fe80::ac75:23ff:fe38:a0d2/64 scope link proto kernel_ll
       valid_lft forever preferred_lft forever

~ ip ro list:
default
	nexthop via 192.0.2.1 dev veth1 weight 1
	nexthop via 203.0.113.1 dev veth3 weight 1
192.0.2.0/24 dev veth1 proto kernel scope link src 192.0.2.2
203.0.113.0/24 dev veth3 proto kernel scope link src 203.0.113.2
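
A configuration equivalent to the above can be created along the
following lines (a sketch only; the veth pairs and the peer namespace on
their far ends are assumed to already exist and are not shown):

   # sketch: the /32 source lives on dummy0, the default route is ECMP
   # across the two veths
   ip link add dummy0 type dummy
   ip link set dummy0 up
   ip addr add 198.51.100.1/32 dev dummy0
   ip addr add 192.0.2.2/24 dev veth1
   ip addr add 203.0.113.2/24 dev veth3
   ip route add default \
      nexthop via 192.0.2.1 dev veth1 weight 1 \
      nexthop via 203.0.113.1 dev veth3 weight 1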

before:
   for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
    255 veth3

after:
   for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
    122 veth1
    133 veth3
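
The same per-device counts can also be collected from JSON output (a
sketch, mirroring the jq approach used by the selftest in patch 2/2):

   # sketch: count which device each of the 255 lookups resolves to
   for i in {1..255}; do
      ip -j route get 10.0.0.$i | jq -r '.[0].dev'
   done | sort | uniq -c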

Fixes: 32607a332cfe ("ipv4: prefer multipath nexthop that matches source address")
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
---
v1 -> v2:

- add score calculation for nexthop to keep original logic
- adjust commit message to explain the config
- use dummy device instead of loopback
---

 net/ipv4/fib_semantics.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a5f3c8459758..4d3650d20ff2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2167,8 +2167,8 @@ void fib_select_multipath(struct fib_result *res, int hash,
 {
 	struct fib_info *fi = res->fi;
 	struct net *net = fi->fib_net;
-	bool found = false;
 	bool use_neigh;
+	int score = -1;
 	__be32 saddr;
 
 	if (unlikely(res->fi->nh)) {
@@ -2180,7 +2180,7 @@ void fib_select_multipath(struct fib_result *res, int hash,
 	saddr = fl4 ? fl4->saddr : 0;
 
 	change_nexthops(fi) {
-		int nh_upper_bound;
+		int nh_upper_bound, nh_score = 0;
 
 		/* Nexthops without a carrier are assigned an upper bound of
 		 * minus one when "ignore_routes_with_linkdown" is set.
@@ -2190,24 +2190,16 @@ void fib_select_multipath(struct fib_result *res, int hash,
 		    (use_neigh && !fib_good_nh(nexthop_nh)))
 			continue;
 
-		if (!found) {
+		if (saddr && nexthop_nh->nh_saddr == saddr)
+			nh_score += 2;
+		if (hash <= nh_upper_bound)
+			nh_score++;
+		if (score < nh_score) {
 			res->nh_sel = nhsel;
 			res->nhc = &nexthop_nh->nh_common;
-			found = !saddr || nexthop_nh->nh_saddr == saddr;
+			score = nh_score;
 		}
 
-		if (hash > nh_upper_bound)
-			continue;
-
-		if (!saddr || nexthop_nh->nh_saddr == saddr) {
-			res->nh_sel = nhsel;
-			res->nhc = &nexthop_nh->nh_common;
-			return;
-		}
-
-		if (found)
-			return;
-
 	} endfor_nexthops(fi);
 }
 #endif
-- 
2.47.3


* [PATCH net v2 2/2] selftests: fib_test: Add test case for ipv4 multi nexthops
  2025-12-20  3:23 [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback Vadim Fedorenko
@ 2025-12-20  3:23 ` Vadim Fedorenko
  2025-12-21 15:59   ` Willem de Bruijn
  2025-12-21 15:55 ` [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback Willem de Bruijn
  1 sibling, 1 reply; 6+ messages in thread
From: Vadim Fedorenko @ 2025-12-20  3:23 UTC (permalink / raw)
  To: David S. Miller, David Ahern, Eric Dumazet, Paolo Abeni,
	Simon Horman, Willem de Bruijn, Jakub Kicinski
  Cc: Shuah Khan, Ido Schimmel, netdev, Vadim Fedorenko

The test checks that, for a route with multiple nexthops, the nexthop
whose source address matches the packet's source IP is preferred. When
the source IP is assigned to a dummy interface, it checks that lookups
are balanced across the nexthops.

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
---
v1 -> v2:
- move tests to fib_tests.sh
---
 tools/testing/selftests/net/fib_tests.sh | 70 +++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index a88f797c549a..c5694cc4ddd2 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -12,7 +12,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
        ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
        ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
        ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance \
-       fib6_ra_to_static"
+       ipv4_mpath_balance_preferred fib6_ra_to_static"
 
 VERBOSE=0
 PAUSE_ON_FAIL=no
@@ -2751,6 +2751,73 @@ ipv4_mpath_balance_test()
 	forwarding_cleanup
 }
 
+get_route_dev_src()
+{
+	local pfx="$1"
+	local src="$2"
+	local out
+
+	if out=$($IP -j route get "$pfx" from "$src" | jq -re ".[0].dev"); then
+		echo "$out"
+	fi
+}
+
+ipv4_mpath_preferred()
+{
+	local src_ip=$1
+	local pref_dev=$2
+	local dev routes
+	local route0=0
+	local route1=0
+	local pref_route=0
+	num_routes=254
+
+	for i in $(seq 1 $num_routes) ; do
+		dev=$(get_route_dev_src 172.16.105.$i $src_ip)
+		if [ "$dev" = "$pref_dev" ]; then
+			pref_route=$((pref_route+1))
+		elif [ "$dev" = "veth1" ]; then
+			route0=$((route0+1))
+		elif [ "$dev" = "veth3" ]; then
+			route1=$((route1+1))
+		fi
+	done
+
+	routes=$((route0+route1))
+
+	[ "$VERBOSE" = "1" ] && echo "multipath: routes seen: ($route0,$route1,$pref_route)"
+
+	if [ x"$pref_dev" = x"" ]; then
+		[[ $routes -ge $num_routes ]] && [[ $route0 -gt 0 ]] && [[ $route1 -gt 0 ]]
+	else
+		[[ $pref_route -ge $num_routes ]]
+	fi
+
+}
+
+ipv4_mpath_balance_preferred_test()
+{
+	echo
+	echo "IPv4 multipath load balance preferred route"
+
+	forwarding_setup
+
+	$IP route add 172.16.105.0/24 \
+		nexthop via 172.16.101.2 \
+		nexthop via 172.16.103.2
+
+	ipv4_mpath_preferred 172.16.101.1 veth1
+	log_test $? 0 "IPv4 multipath loadbalance from veth1"
+
+	ipv4_mpath_preferred 172.16.103.1 veth3
+	log_test $? 0 "IPv4 multipath loadbalance from veth3"
+
+	ipv4_mpath_preferred 198.51.100.1
+	log_test $? 0 "IPv4 multipath loadbalance from dummy"
+
+	forwarding_cleanup
+}
+
 ipv6_mpath_balance_test()
 {
 	echo
@@ -2861,6 +2928,7 @@ do
 	ipv6_mpath_list)		ipv6_mpath_list_test;;
 	ipv4_mpath_balance)		ipv4_mpath_balance_test;;
 	ipv6_mpath_balance)		ipv6_mpath_balance_test;;
+	ipv4_mpath_balance_preferred)	ipv4_mpath_balance_preferred_test;;
 	fib6_ra_to_static)		fib6_ra_to_static;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
-- 
2.47.3
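
(Usage sketch: fib_tests.sh normally lets a single test be selected with
-t, so once this patch is applied the new case should be runnable along
these lines; treat the exact invocation as an assumption:)

   cd tools/testing/selftests/net
   ./fib_tests.sh -t ipv4_mpath_balance_preferred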


* Re: [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback
  2025-12-20  3:23 [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback Vadim Fedorenko
  2025-12-20  3:23 ` [PATCH net v2 2/2] selftests: fib_test: Add test case for ipv4 multi nexthops Vadim Fedorenko
@ 2025-12-21 15:55 ` Willem de Bruijn
  2025-12-21 16:58   ` Ido Schimmel
  1 sibling, 1 reply; 6+ messages in thread
From: Willem de Bruijn @ 2025-12-21 15:55 UTC (permalink / raw)
  To: Vadim Fedorenko, David S. Miller, David Ahern, Eric Dumazet,
	Paolo Abeni, Simon Horman, Willem de Bruijn, Jakub Kicinski
  Cc: Shuah Khan, Ido Schimmel, netdev, Vadim Fedorenko

Vadim Fedorenko wrote:
> Preferring the multipath nexthop that matches the source address broke
> ECMP for packets whose source addresses are not in the broadcast domain
> but are instead assigned to loopback/dummy interfaces. The original
> behaviour was to balance across the nexthops; now the last nexthop in
> the group is always used.
> 
> For the case with 198.51.100.1/32 assigned to dummy0 and routed using
> 192.0.2.0/24 and 203.0.113.0/24 networks:
> 
> 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
>     link/ether d6:54:8a:ff:78:f5 brd ff:ff:ff:ff:ff:ff
>     inet 198.51.100.1/32 scope global dummy0
>        valid_lft forever preferred_lft forever
> 7: veth1@if6: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
>     link/ether 06:ed:98:87:6d:8a brd ff:ff:ff:ff:ff:ff link-netnsid 0
>     inet 192.0.2.2/24 scope global veth1
>        valid_lft forever preferred_lft forever
>     inet6 fe80::4ed:98ff:fe87:6d8a/64 scope link proto kernel_ll
>        valid_lft forever preferred_lft forever
> 9: veth3@if8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
>     link/ether ae:75:23:38:a0:d2 brd ff:ff:ff:ff:ff:ff link-netnsid 0
>     inet 203.0.113.2/24 scope global veth3
>        valid_lft forever preferred_lft forever
>     inet6 fe80::ac75:23ff:fe38:a0d2/64 scope link proto kernel_ll
>        valid_lft forever preferred_lft forever
> 
> ~ ip ro list:
> default
> 	nexthop via 192.0.2.1 dev veth1 weight 1
> 	nexthop via 203.0.113.1 dev veth3 weight 1
> 192.0.2.0/24 dev veth1 proto kernel scope link src 192.0.2.2
> 203.0.113.0/24 dev veth3 proto kernel scope link src 203.0.113.2
> 
> before:
>    for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
>     255 veth3
> 
> after:
>    for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
>     122 veth1
>     133 veth3
> 
> Fixes: 32607a332cfe ("ipv4: prefer multipath nexthop that matches source address")
> Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
> ---
> v1 -> v2:
> 
> - add score calculation for nexthop to keep original logic
> - adjust commit message to explain the config
> - use dummy device instead of loopback
> ---
> 
>  net/ipv4/fib_semantics.c | 24 ++++++++----------------
>  1 file changed, 8 insertions(+), 16 deletions(-)
> 
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index a5f3c8459758..4d3650d20ff2 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -2167,8 +2167,8 @@ void fib_select_multipath(struct fib_result *res, int hash,
>  {
>  	struct fib_info *fi = res->fi;
>  	struct net *net = fi->fib_net;
> -	bool found = false;
>  	bool use_neigh;
> +	int score = -1;
>  	__be32 saddr;
>  
>  	if (unlikely(res->fi->nh)) {
> @@ -2180,7 +2180,7 @@ void fib_select_multipath(struct fib_result *res, int hash,
>  	saddr = fl4 ? fl4->saddr : 0;
>  
>  	change_nexthops(fi) {
> -		int nh_upper_bound;
> +		int nh_upper_bound, nh_score = 0;
>  
>  		/* Nexthops without a carrier are assigned an upper bound of
>  		 * minus one when "ignore_routes_with_linkdown" is set.
> @@ -2190,24 +2190,16 @@ void fib_select_multipath(struct fib_result *res, int hash,
>  		    (use_neigh && !fib_good_nh(nexthop_nh)))
>  			continue;
>  
> -		if (!found) {
> +		if (saddr && nexthop_nh->nh_saddr == saddr)
> +			nh_score += 2;
> +		if (hash <= nh_upper_bound)
> +			nh_score++;
> +		if (score < nh_score) {
>  			res->nh_sel = nhsel;
>  			res->nhc = &nexthop_nh->nh_common;
> -			found = !saddr || nexthop_nh->nh_saddr == saddr;

if score == 3 return immediately?

> +			score = nh_score;
>  		}
>  
> -		if (hash > nh_upper_bound)
> -			continue;
> -
> -		if (!saddr || nexthop_nh->nh_saddr == saddr) {
> -			res->nh_sel = nhsel;
> -			res->nhc = &nexthop_nh->nh_common;
> -			return;
> -		}
> -
> -		if (found)
> -			return;
> -
>  	} endfor_nexthops(fi);
>  }
>  #endif
> -- 
> 2.47.3
> 



* Re: [PATCH net v2 2/2] selftests: fib_test: Add test case for ipv4 multi nexthops
  2025-12-20  3:23 ` [PATCH net v2 2/2] selftests: fib_test: Add test case for ipv4 multi nexthops Vadim Fedorenko
@ 2025-12-21 15:59   ` Willem de Bruijn
  0 siblings, 0 replies; 6+ messages in thread
From: Willem de Bruijn @ 2025-12-21 15:59 UTC (permalink / raw)
  To: Vadim Fedorenko, David S. Miller, David Ahern, Eric Dumazet,
	Paolo Abeni, Simon Horman, Willem de Bruijn, Jakub Kicinski
  Cc: Shuah Khan, Ido Schimmel, netdev, Vadim Fedorenko

Vadim Fedorenko wrote:
> The test checks that, for a route with multiple nexthops, the nexthop
> whose source address matches the packet's source IP is preferred. When
> the source IP is assigned to a dummy interface, it checks that lookups
> are balanced across the nexthops.
> 
> Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>

Reviewed-by: Willem de Bruijn <willemb@google.com>

> ---
> v1 -> v2:
> - move tests to fib_tests.sh
> ---
>  tools/testing/selftests/net/fib_tests.sh | 70 +++++++++++++++++++++++-
>  1 file changed, 69 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
> index a88f797c549a..c5694cc4ddd2 100755
> --- a/tools/testing/selftests/net/fib_tests.sh
> +++ b/tools/testing/selftests/net/fib_tests.sh
> @@ -12,7 +12,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \
>         ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \
>         ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \
>         ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance \
> -       fib6_ra_to_static"
> +       ipv4_mpath_balance_preferred fib6_ra_to_static"
>  
>  VERBOSE=0
>  PAUSE_ON_FAIL=no
> @@ -2751,6 +2751,73 @@ ipv4_mpath_balance_test()
>  	forwarding_cleanup
>  }
>  
> +get_route_dev_src()
> +{
> +	local pfx="$1"
> +	local src="$2"

only with my highly pedantic hat on, and only if respinning: these can be local -r
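
i.e., roughly (keeping the rest of the helper as posted; only pfx and src
become readonly, out is still assigned later so it stays a plain local):

	get_route_dev_src()
	{
		# pfx/src are never reassigned, so they can be readonly
		local -r pfx="$1"
		local -r src="$2"
		local out

		if out=$($IP -j route get "$pfx" from "$src" | jq -re ".[0].dev"); then
			echo "$out"
		fi
	}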

> +	local out
> +
> +	if out=$($IP -j route get "$pfx" from "$src" | jq -re ".[0].dev"); then
> +		echo "$out"
> +	fi
> +}
> +
> +ipv4_mpath_preferred()
> +{
> +	local src_ip=$1
> +	local pref_dev=$2
> +	local dev routes
> +	local route0=0
> +	local route1=0
> +	local pref_route=0
> +	num_routes=254
> +
> +	for i in $(seq 1 $num_routes) ; do
> +		dev=$(get_route_dev_src 172.16.105.$i $src_ip)

Similarly, I was going to ask to avoid open-coding the IP prefixes
repeatedly. But that is the style in this file, so fine to follow.

> +		if [ "$dev" = "$pref_dev" ]; then
> +			pref_route=$((pref_route+1))
> +		elif [ "$dev" = "veth1" ]; then
> +			route0=$((route0+1))
> +		elif [ "$dev" = "veth3" ]; then
> +			route1=$((route1+1))
> +		fi
> +	done
> +
> +	routes=$((route0+route1))
> +
> +	[ "$VERBOSE" = "1" ] && echo "multipath: routes seen: ($route0,$route1,$pref_route)"
> +
> +	if [ x"$pref_dev" = x"" ]; then
> +		[[ $routes -ge $num_routes ]] && [[ $route0 -gt 0 ]] && [[ $route1 -gt 0 ]]
> +	else
> +		[[ $pref_route -ge $num_routes ]]
> +	fi
> +
> +}
> +
> +ipv4_mpath_balance_preferred_test()
> +{
> +	echo
> +	echo "IPv4 multipath load balance preferred route"
> +
> +	forwarding_setup
> +
> +	$IP route add 172.16.105.0/24 \
> +		nexthop via 172.16.101.2 \
> +		nexthop via 172.16.103.2
> +
> +	ipv4_mpath_preferred 172.16.101.1 veth1
> +	log_test $? 0 "IPv4 multipath loadbalance from veth1"
> +
> +	ipv4_mpath_preferred 172.16.103.1 veth3
> +	log_test $? 0 "IPv4 multipath loadbalance from veth3"
> +
> +	ipv4_mpath_preferred 198.51.100.1
> +	log_test $? 0 "IPv4 multipath loadbalance from dummy"
> +
> +	forwarding_cleanup
> +}
> +
>  ipv6_mpath_balance_test()
>  {
>  	echo
> @@ -2861,6 +2928,7 @@ do
>  	ipv6_mpath_list)		ipv6_mpath_list_test;;
>  	ipv4_mpath_balance)		ipv4_mpath_balance_test;;
>  	ipv6_mpath_balance)		ipv6_mpath_balance_test;;
> +	ipv4_mpath_balance_preferred)	ipv4_mpath_balance_preferred_test;;
>  	fib6_ra_to_static)		fib6_ra_to_static;;
>  
>  	help) echo "Test names: $TESTS"; exit 0;;
> -- 
> 2.47.3
> 



* Re: [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback
  2025-12-21 15:55 ` [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback Willem de Bruijn
@ 2025-12-21 16:58   ` Ido Schimmel
  2025-12-21 18:49     ` Vadim Fedorenko
  0 siblings, 1 reply; 6+ messages in thread
From: Ido Schimmel @ 2025-12-21 16:58 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Vadim Fedorenko, David S. Miller, David Ahern, Eric Dumazet,
	Paolo Abeni, Simon Horman, Willem de Bruijn, Jakub Kicinski,
	Shuah Khan, netdev

On Sun, Dec 21, 2025 at 10:55:15AM -0500, Willem de Bruijn wrote:
> Vadim Fedorenko wrote:
> > Preferring the multipath nexthop that matches the source address broke
> > ECMP for packets whose source addresses are not in the broadcast domain
> > but are instead assigned to loopback/dummy interfaces. The original
> > behaviour was to balance across the nexthops; now the last nexthop in
> > the group is always used.
> > 
> > For the case with 198.51.100.1/32 assigned to dummy0 and routed using
> > 192.0.2.0/24 and 203.0.113.0/24 networks:
> > 
> > 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
> >     link/ether d6:54:8a:ff:78:f5 brd ff:ff:ff:ff:ff:ff
> >     inet 198.51.100.1/32 scope global dummy0
> >        valid_lft forever preferred_lft forever
> > 7: veth1@if6: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
> >     link/ether 06:ed:98:87:6d:8a brd ff:ff:ff:ff:ff:ff link-netnsid 0
> >     inet 192.0.2.2/24 scope global veth1
> >        valid_lft forever preferred_lft forever
> >     inet6 fe80::4ed:98ff:fe87:6d8a/64 scope link proto kernel_ll
> >        valid_lft forever preferred_lft forever
> > 9: veth3@if8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
> >     link/ether ae:75:23:38:a0:d2 brd ff:ff:ff:ff:ff:ff link-netnsid 0
> >     inet 203.0.113.2/24 scope global veth3
> >        valid_lft forever preferred_lft forever
> >     inet6 fe80::ac75:23ff:fe38:a0d2/64 scope link proto kernel_ll
> >        valid_lft forever preferred_lft forever
> > 
> > ~ ip ro list:
> > default
> > 	nexthop via 192.0.2.1 dev veth1 weight 1
> > 	nexthop via 203.0.113.1 dev veth3 weight 1
> > 192.0.2.0/24 dev veth1 proto kernel scope link src 192.0.2.2
> > 203.0.113.0/24 dev veth3 proto kernel scope link src 203.0.113.2
> > 
> > before:
> >    for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
> >     255 veth3
> > 
> > after:
> >    for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
> >     122 veth1
> >     133 veth3

The commit message only explains the problem, but not the solution...

> > 
> > Fixes: 32607a332cfe ("ipv4: prefer multipath nexthop that matches source address")
> > Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
> > ---
> > v1 -> v2:
> > 
> > - add score calculation for nexthop to keep original logic
> > - adjust commit message to explain the config
> > - use dummy device instead of loopback
> > ---
> > 
> >  net/ipv4/fib_semantics.c | 24 ++++++++----------------
> >  1 file changed, 8 insertions(+), 16 deletions(-)
> > 
> > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> > index a5f3c8459758..4d3650d20ff2 100644
> > --- a/net/ipv4/fib_semantics.c
> > +++ b/net/ipv4/fib_semantics.c
> > @@ -2167,8 +2167,8 @@ void fib_select_multipath(struct fib_result *res, int hash,
> >  {
> >  	struct fib_info *fi = res->fi;
> >  	struct net *net = fi->fib_net;
> > -	bool found = false;
> >  	bool use_neigh;
> > +	int score = -1;
> >  	__be32 saddr;
> >  
> >  	if (unlikely(res->fi->nh)) {
> > @@ -2180,7 +2180,7 @@ void fib_select_multipath(struct fib_result *res, int hash,
> >  	saddr = fl4 ? fl4->saddr : 0;
> >  
> >  	change_nexthops(fi) {
> > -		int nh_upper_bound;
> > +		int nh_upper_bound, nh_score = 0;
> >  
> >  		/* Nexthops without a carrier are assigned an upper bound of
> >  		 * minus one when "ignore_routes_with_linkdown" is set.
> > @@ -2190,24 +2190,16 @@ void fib_select_multipath(struct fib_result *res, int hash,
> >  		    (use_neigh && !fib_good_nh(nexthop_nh)))
> >  			continue;
> >  
> > -		if (!found) {
> > +		if (saddr && nexthop_nh->nh_saddr == saddr)
> > +			nh_score += 2;
> > +		if (hash <= nh_upper_bound)
> > +			nh_score++;
> > +		if (score < nh_score) {
> >  			res->nh_sel = nhsel;
> >  			res->nhc = &nexthop_nh->nh_common;
> > -			found = !saddr || nexthop_nh->nh_saddr == saddr;
> 
> if score == 3 return immediately?

We can also return early in the input path (!saddr) when score is 1.
This seems to work:

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4d3650d20ff2..0caf38e44c73 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2197,6 +2197,8 @@ void fib_select_multipath(struct fib_result *res, int hash,
 		if (score < nh_score) {
 			res->nh_sel = nhsel;
 			res->nhc = &nexthop_nh->nh_common;
+			if (nh_score == 3 || (!saddr && nh_score == 1))
+				return;
 			score = nh_score;
 		}

Tested with net/fib_tests.sh and forwarding/router_multipath.sh
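
(Roughly, from the selftests directory — the forwarding script also needs
a forwarding.config describing the test interfaces, which is assumed to
be set up already:)

	cd tools/testing/selftests/net
	./fib_tests.sh
	cd forwarding
	./router_multipath.sh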

> 
> > +			score = nh_score;
> >  		}
> >  
> > -		if (hash > nh_upper_bound)
> > -			continue;
> > -
> > -		if (!saddr || nexthop_nh->nh_saddr == saddr) {
> > -			res->nh_sel = nhsel;
> > -			res->nhc = &nexthop_nh->nh_common;
> > -			return;
> > -		}
> > -
> > -		if (found)
> > -			return;
> > -
> >  	} endfor_nexthops(fi);
> >  }
> >  #endif
> > -- 
> > 2.47.3
> > 
> 
> 

* Re: [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback
  2025-12-21 16:58   ` Ido Schimmel
@ 2025-12-21 18:49     ` Vadim Fedorenko
  0 siblings, 0 replies; 6+ messages in thread
From: Vadim Fedorenko @ 2025-12-21 18:49 UTC (permalink / raw)
  To: Ido Schimmel, Willem de Bruijn
  Cc: David S. Miller, David Ahern, Eric Dumazet, Paolo Abeni,
	Simon Horman, Willem de Bruijn, Jakub Kicinski, Shuah Khan,
	netdev

On 21/12/2025 16:58, Ido Schimmel wrote:
> On Sun, Dec 21, 2025 at 10:55:15AM -0500, Willem de Bruijn wrote:
>> Vadim Fedorenko wrote:
>>> Preferring the multipath nexthop that matches the source address broke
>>> ECMP for packets whose source addresses are not in the broadcast domain
>>> but are instead assigned to loopback/dummy interfaces. The original
>>> behaviour was to balance across the nexthops; now the last nexthop in
>>> the group is always used.
>>>
>>> For the case with 198.51.100.1/32 assigned to dummy0 and routed using
>>> 192.0.2.0/24 and 203.0.113.0/24 networks:
>>>
>>> 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
>>>      link/ether d6:54:8a:ff:78:f5 brd ff:ff:ff:ff:ff:ff
>>>      inet 198.51.100.1/32 scope global dummy0
>>>         valid_lft forever preferred_lft forever
>>> 7: veth1@if6: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
>>>      link/ether 06:ed:98:87:6d:8a brd ff:ff:ff:ff:ff:ff link-netnsid 0
>>>      inet 192.0.2.2/24 scope global veth1
>>>         valid_lft forever preferred_lft forever
>>>      inet6 fe80::4ed:98ff:fe87:6d8a/64 scope link proto kernel_ll
>>>         valid_lft forever preferred_lft forever
>>> 9: veth3@if8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
>>>      link/ether ae:75:23:38:a0:d2 brd ff:ff:ff:ff:ff:ff link-netnsid 0
>>>      inet 203.0.113.2/24 scope global veth3
>>>         valid_lft forever preferred_lft forever
>>>      inet6 fe80::ac75:23ff:fe38:a0d2/64 scope link proto kernel_ll
>>>         valid_lft forever preferred_lft forever
>>>
>>> ~ ip ro list:
>>> default
>>> 	nexthop via 192.0.2.1 dev veth1 weight 1
>>> 	nexthop via 203.0.113.1 dev veth3 weight 1
>>> 192.0.2.0/24 dev veth1 proto kernel scope link src 192.0.2.2
>>> 203.0.113.0/24 dev veth3 proto kernel scope link src 203.0.113.2
>>>
>>> before:
>>>     for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
>>>      255 veth3
>>>
>>> after:
>>>     for i in {1..255} ; do ip ro get 10.0.0.$i; done | grep veth | awk ' {print $(NF-2)}' | sort | uniq -c:
>>>      122 veth1
>>>      133 veth3
> 
> The commit message only explains the problem, but not the solution...

Well, the solution is to try to restore the original logic. But OK, I'll
explain it explicitly

> 
>>>
>>> Fixes: 32607a332cfe ("ipv4: prefer multipath nexthop that matches source address")
>>> Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
>>> ---
>>> v1 -> v2:
>>>
>>> - add score calculation for nexthop to keep original logic
>>> - adjust commit message to explain the config
>>> - use dummy device instead of loopback
>>> ---
>>>
>>>   net/ipv4/fib_semantics.c | 24 ++++++++----------------
>>>   1 file changed, 8 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
>>> index a5f3c8459758..4d3650d20ff2 100644
>>> --- a/net/ipv4/fib_semantics.c
>>> +++ b/net/ipv4/fib_semantics.c
>>> @@ -2167,8 +2167,8 @@ void fib_select_multipath(struct fib_result *res, int hash,
>>>   {
>>>   	struct fib_info *fi = res->fi;
>>>   	struct net *net = fi->fib_net;
>>> -	bool found = false;
>>>   	bool use_neigh;
>>> +	int score = -1;
>>>   	__be32 saddr;
>>>   
>>>   	if (unlikely(res->fi->nh)) {
>>> @@ -2180,7 +2180,7 @@ void fib_select_multipath(struct fib_result *res, int hash,
>>>   	saddr = fl4 ? fl4->saddr : 0;
>>>   
>>>   	change_nexthops(fi) {
>>> -		int nh_upper_bound;
>>> +		int nh_upper_bound, nh_score = 0;
>>>   
>>>   		/* Nexthops without a carrier are assigned an upper bound of
>>>   		 * minus one when "ignore_routes_with_linkdown" is set.
>>> @@ -2190,24 +2190,16 @@ void fib_select_multipath(struct fib_result *res, int hash,
>>>   		    (use_neigh && !fib_good_nh(nexthop_nh)))
>>>   			continue;
>>>   
>>> -		if (!found) {
>>> +		if (saddr && nexthop_nh->nh_saddr == saddr)
>>> +			nh_score += 2;
>>> +		if (hash <= nh_upper_bound)
>>> +			nh_score++;
>>> +		if (score < nh_score) {
>>>   			res->nh_sel = nhsel;
>>>   			res->nhc = &nexthop_nh->nh_common;
>>> -			found = !saddr || nexthop_nh->nh_saddr == saddr;
>>
>> if score == 3 return immediately?
> 
> We can also return early in the input path (!saddr) when score is 1.
> This seems to work:
> 
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 4d3650d20ff2..0caf38e44c73 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -2197,6 +2197,8 @@ void fib_select_multipath(struct fib_result *res, int hash,
>   		if (score < nh_score) {
>   			res->nh_sel = nhsel;
>   			res->nhc = &nexthop_nh->nh_common;
> +			if (nh_score == 3 || (!saddr && nh_score == 1))
> +				return;
>   			score = nh_score;
>   		}
> 

It makes sense to amortize the loop. Going to send v3

> Tested with net/fib_tests.sh and forwarding/router_multipath.sh
> 
>>
>>> +			score = nh_score;
>>>   		}
>>>   
>>> -		if (hash > nh_upper_bound)
>>> -			continue;
>>> -
>>> -		if (!saddr || nexthop_nh->nh_saddr == saddr) {
>>> -			res->nh_sel = nhsel;
>>> -			res->nhc = &nexthop_nh->nh_common;
>>> -			return;
>>> -		}
>>> -
>>> -		if (found)
>>> -			return;
>>> -
>>>   	} endfor_nexthops(fi);
>>>   }
>>>   #endif
>>> -- 
>>> 2.47.3
>>>
>>
>>


end of thread

Thread overview: 6+ messages:
2025-12-20  3:23 [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback Vadim Fedorenko
2025-12-20  3:23 ` [PATCH net v2 2/2] selftests: fib_test: Add test case for ipv4 multi nexthops Vadim Fedorenko
2025-12-21 15:59   ` Willem de Bruijn
2025-12-21 15:55 ` [PATCH net v2 1/2] net: fib: restore ECMP balance from loopback Willem de Bruijn
2025-12-21 16:58   ` Ido Schimmel
2025-12-21 18:49     ` Vadim Fedorenko
