All of lore.kernel.org
 help / color / mirror / Atom feed
From: Benjamin Poirier <bpoirier@nvidia.com>
To: Mike Manning <mvrmanning@gmail.com>
Cc: Netdev <netdev@vger.kernel.org>, David Ahern <dsahern@gmail.com>,
	Saikrishna Arcot <sarcot@microsoft.com>,
	Craig Gallek <kraig@google.com>
Subject: Re: [PATCH] net: prefer socket bound to interface when not in VRF
Date: Mon, 13 Jun 2022 12:14:46 +0900	[thread overview]
Message-ID: <YqarphOzFTnQRq29@d3> (raw)
In-Reply-To: <cf0a8523-b362-1edf-ee78-eef63cbbb428@gmail.com>

On 2021-10-05 14:03 +0100, Mike Manning wrote:
[...]
> 
> Fixes: 6da5b0f027a8 ("net: ensure unbound datagram socket to be chosen when not in a VRF")
> Fixes: e78190581aff ("net: ensure unbound stream socket to be chosen when not in a VRF")
> Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
> ---
> 
> diff nettest-baseline-9e9fb7655ed5.txt nettest-fix.txt
> 955,956c955,956
> < TEST: IPv4 TCP connection over VRF with SNAT                                  [FAIL]
> < TEST: IPv6 TCP connection over VRF with SNAT                                  [FAIL]
> ---
> > TEST: IPv4 TCP connection over VRF with SNAT                                  [ OK ]
> > TEST: IPv6 TCP connection over VRF with SNAT                                  [ OK ]
> 958,959c958,959
> < Tests passed: 713
> < Tests failed:   5
> ---
> > Tests passed: 715
> > Tests failed:   3
> 
> ---
>  net/ipv4/inet_hashtables.c  | 4 +++-
>  net/ipv4/udp.c              | 3 ++-
>  net/ipv6/inet6_hashtables.c | 2 +-
>  net/ipv6/udp.c              | 3 ++-
>  4 files changed, 8 insertions(+), 4 deletions(-)
> 

Hi Mike,

I was looking at this commit, 8d6c414cd2fb ("net: prefer socket bound to
interface when not in VRF"), and I get the feeling that it is only
partially effective. It works with UDP connected sockets but it doesn't
work for TCP and UDP unconnected sockets.

The compute_score() functions are a bit misleading. Because of the
reuseport shortcut in their callers (inet_lhash2_lookup() and the like),
the first socket with score > 0 may be chosen, not necessarily the
socket with highest score. In order to prefer certain sockets, I think
an approach like commit d894ba18d4e4 ("soreuseport: fix ordering for
mixed v4/v6 sockets") would be needed. What do you think?

Extra info:
1) fcnal-test.sh results

I tried to reproduce the fcnal-test.sh test results quoted above but in
my case the test cases already pass at 8d6c414cd2fb^ and 9e9fb7655ed5.
Moreover I believe those test cases don't have multiple listening
sockets. So that just added to my confusion.

Running 9e9fb7655ed5,
root@vsid:/src/linux/tools/testing/selftests/net# ./fcnal-test.sh -t use_cases
[...]

#################################################################
SNAT on VRF

TEST: IPv4 TCP connection over VRF with SNAT                                  [ OK ]
TEST: IPv6 TCP connection over VRF with SNAT                                  [ OK ]

Tests passed:  16
Tests failed:   0


2) reuseport_bindtodevice test

I wrote a selftest based on
tools/testing/selftests/net/reuseport_addr_any.c It tests that listening
sockets that have SO_BINDTODEVICE set are preferred over ones that do
not. All of the sockets have SO_REUSEPORT set. I ran it over a few
relevant revisions:

               IPv4                       IPv6
HEAD           TCP  UDP unconn  UDP conn  TCP  UDP unconn  UDP conn
6a5ef90c58da^  ✔    ✔           ✔         ✔    ✔           ✔   
6a5ef90c58da   ✔    ✘           ✔         ✔    ✘           ✔   
fd1914b2901b   ✘    ✘           ✔         ✘    ✘           ✔   
7e225619e8af   ✘    ✘           ✘         ✘    ✘           ✘   
8d6c414cd2fb   ✘    ✘           ✔         ✘    ✘           ✔   

✔ pass
✘ fail

reuseport_bindtodevice.c:

// SPDX-License-Identifier: GPL-2.0

/* Test that listening sockets that have SO_BINDTODEVICE set are preferred
 * over ones that do not. All of the sockets have SO_REUSEPORT set.
 */

#define _GNU_SOURCE

#include <arpa/inet.h>
#include <errno.h>
#include <error.h>
#include <linux/in.h>
#include <linux/unistd.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>


static const int SEND_PORT = 8888;
static const int RECV_PORT = 8889;

static const char *get_family_name(int domain)
{
	if (domain == AF_INET)
		return "IPv4";
	else if (domain == AF_INET6)
		return "IPv6";
	else
		error(1, 0, "Unknown address family \"%d\"", domain);

	return "";
}

static void build_rcv_fd(int domain, int type, int *rcv_fds, int count,
			 const char *ifname, bool do_connect)
{
	struct sockaddr_storage saddr, daddr;
	int opt, i;

	if (domain == AF_INET) {
		struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr,
				   *daddr4 = (struct sockaddr_in *)&daddr;

		saddr4->sin_family = AF_INET;
		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		saddr4->sin_port = htons(RECV_PORT);

		daddr4->sin_family = AF_INET;
		daddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		daddr4->sin_port = htons(SEND_PORT);
	} else if (domain == AF_INET6) {
		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)&saddr,
				    *daddr6 = (struct sockaddr_in6 *)&daddr;

		saddr6->sin6_family = AF_INET6;
		saddr6->sin6_addr = in6addr_any;
		saddr6->sin6_port = htons(RECV_PORT);

		daddr6->sin6_family = AF_INET6;
		daddr6->sin6_addr = in6addr_any;
		daddr6->sin6_port = htons(SEND_PORT);
	} else {
		error(1, 0, "Unsupported family %d", domain);
	}

	for (i = 0; i < count; ++i) {
		rcv_fds[i] = socket(domain, type, 0);
		if (rcv_fds[i] < 0)
			error(1, errno, "failed to create receive socket");

		opt = 1;
		if (setsockopt(rcv_fds[i], SOL_SOCKET, SO_REUSEPORT, &opt,
			       sizeof(opt)))
			error(1, errno, "failed to set SO_REUSEPORT");

		if (ifname && setsockopt(rcv_fds[i], SOL_SOCKET,
					 SO_BINDTODEVICE, ifname,
					 strlen(ifname)))
			error(1, errno, "failed to set SO_BINDTODEVICE");

		if (bind(rcv_fds[i], (struct sockaddr *)&saddr, sizeof(saddr)))
			error(1, errno, "failed to bind receive socket");

		if (do_connect &&
		    connect(rcv_fds[i], (struct sockaddr *)&daddr,
			    sizeof(daddr)))
			error(1, errno, "failed to connect receive socket");

		if (type == SOCK_STREAM && listen(rcv_fds[i], 10))
			error(1, errno, "failed to listen on receive socket");
	}
}

static int connect_and_send(int domain, int type)
{
	struct sockaddr_storage saddr, daddr;
	int fd;

	if (domain == AF_INET) {
		struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr,
				   *daddr4 = (struct sockaddr_in *)&daddr;

		saddr4->sin_family = AF_INET;
		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		saddr4->sin_port = htons(SEND_PORT);

		daddr4->sin_family = AF_INET;
		daddr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		daddr4->sin_port = htons(RECV_PORT);
	} else if (domain == AF_INET6) {
		struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)&saddr,
				    *daddr6 = (struct sockaddr_in6 *)&daddr;

		saddr6->sin6_family = AF_INET6;
		saddr6->sin6_addr = in6addr_any;
		saddr6->sin6_port = htons(SEND_PORT);

		daddr6->sin6_family = AF_INET6;
		daddr6->sin6_addr = in6addr_loopback;
		daddr6->sin6_port = htons(RECV_PORT);
	} else {
		error(1, 0, "Unsupported family %d", domain);
	}

	fd = socket(domain, type, 0);
	if (fd < 0)
		error(1, errno, "failed to create send socket");

	if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)))
		error(1, errno, "failed to bind send socket");

	if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)))
		error(1, errno, "failed to connect send socket");

	if (send(fd, "a", 1, 0) < 0)
		error(1, errno, "failed to send message");

	return fd;
}

static int receive_once(int epfd, int type)
{
	struct epoll_event ev;
	int i, fd;
	char buf[8];

	i = epoll_wait(epfd, &ev, 1, 3);
	if (i < 0)
		error(1, errno, "epoll_wait failed");
	else if (i == 0)
		error(1, errno, "no socket is ready");

	if (type == SOCK_STREAM) {
		fd = accept(ev.data.fd, NULL, NULL);
		if (fd < 0)
			error(1, errno, "failed to accept");
		i = recv(fd, buf, sizeof(buf), 0);
		close(fd);
	} else {
		i = recv(ev.data.fd, buf, sizeof(buf), 0);
	}

	if (i < 0)
		error(1, errno, "failed to recv");

	return ev.data.fd;
}

static int test(int *rcv_fds, int count, int domain, int type, int fd)
{
	int epfd, i, send_fd, recv_fd;
	struct epoll_event ev;

	epfd = epoll_create(1);
	if (epfd < 0)
		error(1, errno, "failed to create epoll");

	ev.events = EPOLLIN;
	for (i = 0; i < count; ++i) {
		ev.data.fd = rcv_fds[i];
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, rcv_fds[i], &ev))
			error(1, errno, "failed to register sock epoll");
	}

	send_fd = connect_and_send(domain, type);

	recv_fd = receive_once(epfd, type);

	close(send_fd);
	close(epfd);

	return recv_fd == fd;
}

static int run_one_test(int domain, int type, bool do_connect)
{
	/* Below we test that a socket listening with SO_BINDTODEVICE set is
	 * always selected in preference over a socket listening without. Bugs
	 * where this is not the case often result in sockets created first or
	 * last to get picked. So below we make sure that there are always
	 * sockets with SO_BINDTODEVICE created before and after a specific
	 * socket is created.
	 */
	int rcv_fds[10], i, result;

	build_rcv_fd(AF_INET, type, rcv_fds, 2, NULL, do_connect);
	build_rcv_fd(AF_INET6, type, rcv_fds + 2, 2, NULL, do_connect);
	build_rcv_fd(domain, type, rcv_fds + 4, 1, "lo", do_connect);
	build_rcv_fd(AF_INET, type, rcv_fds + 5, 2, NULL, do_connect);
	build_rcv_fd(AF_INET6, type, rcv_fds + 7, 2, NULL, do_connect);
	result = test(rcv_fds, 9, domain, type, rcv_fds[4]);
	for (i = 0; i < 9; ++i)
		close(rcv_fds[i]);
	if (result)
		fprintf(stderr, "pass\n");
	else
		fprintf(stderr, "fail\n");
	return result;
}

static int test_family(int domain)
{
	int result = 1;

	fprintf(stderr, "%s TCP ... ", get_family_name(domain));
	result &= run_one_test(domain, SOCK_STREAM, false);

	fprintf(stderr, "%s UDP unconnected ... ", get_family_name(domain));
	result &= run_one_test(domain, SOCK_DGRAM, false);

	fprintf(stderr, "%s UDP connected ... ", get_family_name(domain));
	result &= run_one_test(domain, SOCK_DGRAM, true);

	return result;
}

int main(void)
{
	int result = 1;

	result &= test_family(AF_INET);
	result &= test_family(AF_INET6);

	if (result) {
		fprintf(stderr, "SUCCESS\n");
		return 0;
	}
	fprintf(stderr, "FAIL\n");
	return 1;
}

  parent reply	other threads:[~2022-06-13  3:14 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-10-05 13:03 [PATCH] net: prefer socket bound to interface when not in VRF Mike Manning
2021-10-07 14:07 ` Jakub Kicinski
2021-10-07 14:10   ` David Ahern
2021-10-07 14:27     ` Jakub Kicinski
2022-06-13  3:14 ` Benjamin Poirier [this message]
2022-06-13  3:52   ` David Ahern
2022-06-14  0:39     ` Benjamin Poirier
2022-06-26 22:25   ` Mike Manning
2022-06-27  0:05     ` Benjamin Poirier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YqarphOzFTnQRq29@d3 \
    --to=bpoirier@nvidia.com \
    --cc=dsahern@gmail.com \
    --cc=kraig@google.com \
    --cc=mvrmanning@gmail.com \
    --cc=netdev@vger.kernel.org \
    --cc=sarcot@microsoft.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.