[RFC] net: add new socket option SO_SETNETNS

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC] net: add new socket option SO_SETNETNS
@ 2023-02-01 19:22 aloktiagi
  2023-02-02  1:48 ` Hillf Danton
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: aloktiagi @ 2023-02-01 19:22 UTC (permalink / raw)
  To: ebiederm
  Cc: davem, edumazet, kuba, pabeni, netdev, linux-kernel, tycho,
	sargun, aloktiagi

This socket option provides a mechanism for users to switch a socket's network
namespace. This enables use cases where multiple IPv6 only network namespaces
can use a single IPv4 network namespace for IPv4 only egress connectivity by
switching their sockets from IPv6 to IPv4 network namespace. This allows for
migration of systems to IPv6 only while keeping their connectivity to IPv4 only
destinations intact.

Today, we achieve this by setting up a seccomp filter to intercept network system
calls like connect() from a container in a container manager which runs in an
IPv4 only network namespace. The container manager creates a new IPv4 connection
and injects the new file descriptor through SECCOMP_IOCTL_NOTIF_ADDFD, replacing
the original file descriptor from the connect() call. This does not work for
cases where the original file descriptor is handed off to a system like epoll
before the connect() call. After a new file descriptor is injected, the original
file descriptor being referenced by the epoll fd is no longer valid, leading to
failures. As a workaround, the container manager, when intercepting connect(),
loops through all open socket file descriptors to check if they are referencing
the socket attempting the connect() and replaces the reference with the
to-be-injected file descriptor. This workaround is cumbersome and makes the
solution prone to similar, yet-to-be-discovered issues.

With SO_SETNETNS, the container manager can simply switch the original
unconnected socket’s network namespace to the IPv4 only network namespace
without the need for injecting any new socket. The container can then proceed
with the connect() call and establish connectivity to the IPv4 only destination.

This socket option is only allowed for sockets that have never been connected,
since connected or recently disconnected sockets may be bound to their network
namespace's network device and switching their namespace may lead to undefined
behavior.

Signed-off-by: aloktiagi <aloktiagi@gmail.com>
---
 include/uapi/asm-generic/socket.h          |   2 +
 net/core/sock.c                            |  46 +++++
 tools/testing/selftests/net/Makefile       |   1 +
 tools/testing/selftests/net/so_set_netns.c | 208 +++++++++++++++++++++
 4 files changed, 257 insertions(+)
 create mode 100644 tools/testing/selftests/net/so_set_netns.c

diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 638230899e98..dc9498233fe5 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -132,6 +132,8 @@
 
 #define SO_RCVMARK		75
 
+#define SO_SETNETNS		76
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/sock.c b/net/core/sock.c
index f954d5893e79..34cb72b211a6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
 		break;
 
+	case SO_SETNETNS:
+	{
+		struct net *other_ns, *my_ns;
+
+		if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		other_ns = get_net_ns_by_fd(val);
+		if (IS_ERR(other_ns)) {
+			ret = PTR_ERR(other_ns);
+			break;
+		}
+
+		if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
+			ret = -EPERM;
+			goto out_err;
+		}
+
+		/* check that the socket has never been connected or recently disconnected */
+		if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
+			ret = -EOPNOTSUPP;
+			goto out_err;
+		}
+
+		/* check that the socket is not bound to an interface*/
+		if (sk->sk_bound_dev_if != 0) {
+			ret = -EOPNOTSUPP;
+			goto out_err;
+		}
+
+		my_ns = sock_net(sk);
+		sock_net_set(sk, other_ns);
+		put_net(my_ns);
+		break;
+out_err:
+		put_net(other_ns);
+		break;
+	}
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 3007e98a6d64..c2e7679e31bb 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -75,6 +75,7 @@ TEST_GEN_PROGS += so_incoming_cpu
 TEST_PROGS += sctp_vrf.sh
 TEST_GEN_FILES += sctp_hello
 TEST_GEN_FILES += csum
+TEST_GEN_PROGS += so_set_netns
 
 TEST_FILES := settings
 
diff --git a/tools/testing/selftests/net/so_set_netns.c b/tools/testing/selftests/net/so_set_netns.c
new file mode 100644
index 000000000000..cc7767d23a5d
--- /dev/null
+++ b/tools/testing/selftests/net/so_set_netns.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <linux/tcp.h>
+#include <linux/socket.h>
+
+#include <sys/types.h>
+#include <sys/sendfile.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef SO_SETNETNS
+#define SO_SETNETNS            76
+#endif
+
+static int unshare_open(void)
+{
+	const char *netns_path = "/proc/self/ns/net";
+	int fd, ret;
+
+	if (unshare(CLONE_NEWNET) != 0)
+		return -1;
+
+	fd = open(netns_path, O_RDONLY);
+	if (fd <= 0)
+		return -1;
+
+	ret = system("ip link set lo up");
+	if (ret < 0)
+		return -1;
+
+	return fd;
+}
+
+static int switch_ns(int fd)
+{
+	if (setns(fd, CLONE_NEWNET))
+		return -1;
+	return 0;
+}
+
+static void init_namespaces(struct __test_metadata *_metadata,
+			   int *netns_client, int *netns_server)
+{
+	*netns_client = unshare_open();
+	ASSERT_GE(*netns_client, 0);
+
+	*netns_server = unshare_open();
+	ASSERT_GE(*netns_server, 0);
+}
+
+static void setup_network(struct __test_metadata *_metadata,
+			  int *netns_client, int *netns_server)
+{
+	int ret;
+
+	ret = switch_ns(*netns_client);
+	ASSERT_EQ(ret, 0);
+
+	ret = system("ip addr add fd::1/64 dev lo");
+	ASSERT_EQ(ret, 0);
+
+	ret = switch_ns(*netns_server);
+	ASSERT_EQ(ret, 0);
+
+	ret = system("ip addr add 192.168.1.1/24 dev lo");
+	ASSERT_EQ(ret, 0);
+}
+
+static void setup_client_server(struct __test_metadata *_metadata,
+				int *netns_client, int *netns_server,
+			        int *client_fd, int *server_fd)
+{
+	struct sockaddr_in addr;
+	int ret;
+
+	ret = switch_ns(*netns_client);
+	ASSERT_EQ(ret, 0);
+
+	*client_fd = socket(AF_INET, SOCK_STREAM, 0);
+
+	ret = switch_ns(*netns_server);
+	ASSERT_EQ(ret, 0);
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+	addr.sin_port = htons(80);
+
+	*server_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ret = bind(*server_fd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+	ret = listen(*server_fd, 10);
+	ASSERT_EQ(ret, 0);
+}
+
+FIXTURE(so_set_netns)
+{
+	int netns_client, netns_server;
+	int client_fd, server_fd;
+};
+
+FIXTURE_SETUP(so_set_netns)
+{
+	init_namespaces(_metadata, &self->netns_client, &self->netns_server);
+	setup_network(_metadata, &self->netns_client, &self->netns_server);
+	setup_client_server(_metadata,
+			    &self->netns_client, &self->netns_server,
+			    &self->client_fd, &self->server_fd);
+}
+
+FIXTURE_TEARDOWN(so_set_netns)
+{
+	close(self->client_fd);
+	close(self->server_fd);
+	close(self->netns_client);
+	close(self->netns_server);
+}
+
+TEST_F(so_set_netns, test_socket_ns_switch_unconnected) {
+	struct sockaddr_in addr;
+	int ret;
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+	addr.sin_port = htons(80);
+
+	ret = switch_ns(self->netns_client);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(self->client_fd,
+                         SOL_SOCKET, SO_SETNETNS,
+                         &self->netns_server,
+                         sizeof(self->netns_server));
+	ASSERT_EQ(ret, 0);
+
+	ret = connect(self->client_fd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+}
+
+TEST_F(so_set_netns, test_socket_ns_switch_connected) {
+	struct sockaddr_in addr;
+	int ret;
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+	addr.sin_port = htons(80);
+
+	ret = setsockopt(self->client_fd,
+                         SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_server,
+			 sizeof(self->netns_server));
+	ASSERT_EQ(ret, 0);
+
+	ret = connect(self->client_fd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+
+	// switching network namespace of connected
+	// socket should fail
+	ret = setsockopt(self->client_fd,
+			 SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_client,
+			 sizeof(self->netns_client));
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EOPNOTSUPP);
+}
+
+TEST_F(so_set_netns, test_socket_ns_switch_disconnected) {
+	struct sockaddr_in addr;
+	int ret;
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+	addr.sin_port = htons(80);
+
+	ret = setsockopt(self->client_fd,
+			 SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_server,
+			 sizeof(self->netns_server));
+	ASSERT_EQ(ret, 0);
+
+	ret = connect(self->client_fd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+
+	close(self->server_fd);
+
+	// switching network namespace of recently disconnected
+	// socket should fail
+	ret = setsockopt(self->client_fd,
+			 SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_client,
+			 sizeof(self->netns_client));
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EOPNOTSUPP);
+}
+
+TEST_HARNESS_MAIN
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
@ 2023-02-02  1:48 ` Hillf Danton
  2023-02-02 19:55   ` Alok Tiagi
  2023-02-07 11:48 ` kernel test robot
  2023-02-07 14:21 ` kernel test robot
  2 siblings, 1 reply; 11+ messages in thread
From: Hillf Danton @ 2023-02-02  1:48 UTC (permalink / raw)
  To: aloktiagi; +Cc: ebiederm, edumazet, netdev, linux-mm, linux-kernel

On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
>  		WRITE_ONCE(sk->sk_txrehash, (u8)val);
>  		break;
>  
> +	case SO_SETNETNS:
> +	{
> +		struct net *other_ns, *my_ns;
> +
> +		if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> +			ret = -EOPNOTSUPP;
> +			break;
> +		}
> +
> +		if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> +			ret = -EOPNOTSUPP;
> +			break;
> +		}
> +
> +		other_ns = get_net_ns_by_fd(val);
> +		if (IS_ERR(other_ns)) {
> +			ret = PTR_ERR(other_ns);
> +			break;
> +		}
> +
> +		if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> +			ret = -EPERM;
> +			goto out_err;
> +		}
> +
> +		/* check that the socket has never been connected or recently disconnected */
> +		if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> +			ret = -EOPNOTSUPP;
> +			goto out_err;
> +		}
> +
> +		/* check that the socket is not bound to an interface*/
> +		if (sk->sk_bound_dev_if != 0) {
> +			ret = -EOPNOTSUPP;
> +			goto out_err;
> +		}
> +
> +		my_ns = sock_net(sk);
> +		sock_net_set(sk, other_ns);
> +		put_net(my_ns);
> +		break;

		cpu 0				cpu 2
		---				---
						ns = sock_net(sk);
		my_ns = sock_net(sk);
		sock_net_set(sk, other_ns);
		put_net(my_ns);
						ns is invalid ?

> +out_err:
> +		put_net(other_ns);
> +		break;
> +	}
> +
>  	default:
>  		ret = -ENOPROTOOPT;
>  		break;


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-02  1:48 ` Hillf Danton
@ 2023-02-02 19:55   ` Alok Tiagi
  2023-02-02 20:10     ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Alok Tiagi @ 2023-02-02 19:55 UTC (permalink / raw)
  To: Hillf Danton; +Cc: ebiederm, edumazet, netdev, linux-mm, linux-kernel

On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> >  		WRITE_ONCE(sk->sk_txrehash, (u8)val);
> >  		break;
> >  
> > +	case SO_SETNETNS:
> > +	{
> > +		struct net *other_ns, *my_ns;
> > +
> > +		if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > +			ret = -EOPNOTSUPP;
> > +			break;
> > +		}
> > +
> > +		if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > +			ret = -EOPNOTSUPP;
> > +			break;
> > +		}
> > +
> > +		other_ns = get_net_ns_by_fd(val);
> > +		if (IS_ERR(other_ns)) {
> > +			ret = PTR_ERR(other_ns);
> > +			break;
> > +		}
> > +
> > +		if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > +			ret = -EPERM;
> > +			goto out_err;
> > +		}
> > +
> > +		/* check that the socket has never been connected or recently disconnected */
> > +		if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > +			ret = -EOPNOTSUPP;
> > +			goto out_err;
> > +		}
> > +
> > +		/* check that the socket is not bound to an interface*/
> > +		if (sk->sk_bound_dev_if != 0) {
> > +			ret = -EOPNOTSUPP;
> > +			goto out_err;
> > +		}
> > +
> > +		my_ns = sock_net(sk);
> > +		sock_net_set(sk, other_ns);
> > +		put_net(my_ns);
> > +		break;
> 
> 		cpu 0				cpu 2
> 		---				---
> 						ns = sock_net(sk);
> 		my_ns = sock_net(sk);
> 		sock_net_set(sk, other_ns);
> 		put_net(my_ns);
> 						ns is invalid ?

That is the reason we want the socket to be in an un-connected state. That
should help us avoid this situation.

> 
> > +out_err:
> > +		put_net(other_ns);
> > +		break;
> > +	}
> > +
> >  	default:
> >  		ret = -ENOPROTOOPT;
> >  		break;


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-02 19:55   ` Alok Tiagi
@ 2023-02-02 20:10     ` Eric Dumazet
  2023-02-02 23:58       ` Alok Tiagi
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2023-02-02 20:10 UTC (permalink / raw)
  To: Alok Tiagi; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel

On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
>
> On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > >             WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > >             break;
> > >
> > > +   case SO_SETNETNS:
> > > +   {
> > > +           struct net *other_ns, *my_ns;
> > > +
> > > +           if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > +                   ret = -EOPNOTSUPP;
> > > +                   break;
> > > +           }
> > > +
> > > +           if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > +                   ret = -EOPNOTSUPP;
> > > +                   break;
> > > +           }
> > > +
> > > +           other_ns = get_net_ns_by_fd(val);
> > > +           if (IS_ERR(other_ns)) {
> > > +                   ret = PTR_ERR(other_ns);
> > > +                   break;
> > > +           }
> > > +
> > > +           if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > +                   ret = -EPERM;
> > > +                   goto out_err;
> > > +           }
> > > +
> > > +           /* check that the socket has never been connected or recently disconnected */
> > > +           if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > +                   ret = -EOPNOTSUPP;
> > > +                   goto out_err;
> > > +           }
> > > +
> > > +           /* check that the socket is not bound to an interface*/
> > > +           if (sk->sk_bound_dev_if != 0) {
> > > +                   ret = -EOPNOTSUPP;
> > > +                   goto out_err;
> > > +           }
> > > +
> > > +           my_ns = sock_net(sk);
> > > +           sock_net_set(sk, other_ns);
> > > +           put_net(my_ns);
> > > +           break;
> >
> >               cpu 0                           cpu 2
> >               ---                             ---
> >                                               ns = sock_net(sk);
> >               my_ns = sock_net(sk);
> >               sock_net_set(sk, other_ns);
> >               put_net(my_ns);
> >                                               ns is invalid ?
>
> That is the reason we want the socket to be in an un-connected state. That
> should help us avoid this situation.

This is not enough....

Another thread might look at sock_net(sk), for example from inet_diag
or tcp timers
(which can be fired even in un-connected state)

Even UDP sockets can receive packets while being un-connected,
and they need to deref the net pointer.

Currently there is no protection against sock_net(sk) being changed on the fly,
and the struct net could disappear and be freed.

There are ~1500 uses of sock_net(sk) in the kernel, I do not think
you/we want to audit all
of them to check what could go wrong...


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-02 20:10     ` Eric Dumazet
@ 2023-02-02 23:58       ` Alok Tiagi
  2023-02-03 15:09         ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Alok Tiagi @ 2023-02-02 23:58 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel

On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >
> > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > > >             WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > > >             break;
> > > >
> > > > +   case SO_SETNETNS:
> > > > +   {
> > > > +           struct net *other_ns, *my_ns;
> > > > +
> > > > +           if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > > +                   ret = -EOPNOTSUPP;
> > > > +                   break;
> > > > +           }
> > > > +
> > > > +           if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > > +                   ret = -EOPNOTSUPP;
> > > > +                   break;
> > > > +           }
> > > > +
> > > > +           other_ns = get_net_ns_by_fd(val);
> > > > +           if (IS_ERR(other_ns)) {
> > > > +                   ret = PTR_ERR(other_ns);
> > > > +                   break;
> > > > +           }
> > > > +
> > > > +           if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > > +                   ret = -EPERM;
> > > > +                   goto out_err;
> > > > +           }
> > > > +
> > > > +           /* check that the socket has never been connected or recently disconnected */
> > > > +           if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > > +                   ret = -EOPNOTSUPP;
> > > > +                   goto out_err;
> > > > +           }
> > > > +
> > > > +           /* check that the socket is not bound to an interface*/
> > > > +           if (sk->sk_bound_dev_if != 0) {
> > > > +                   ret = -EOPNOTSUPP;
> > > > +                   goto out_err;
> > > > +           }
> > > > +
> > > > +           my_ns = sock_net(sk);
> > > > +           sock_net_set(sk, other_ns);
> > > > +           put_net(my_ns);
> > > > +           break;
> > >
> > >               cpu 0                           cpu 2
> > >               ---                             ---
> > >                                               ns = sock_net(sk);
> > >               my_ns = sock_net(sk);
> > >               sock_net_set(sk, other_ns);
> > >               put_net(my_ns);
> > >                                               ns is invalid ?
> >
> > That is the reason we want the socket to be in an un-connected state. That
> > should help us avoid this situation.
> 
> This is not enough....
> 
> Another thread might look at sock_net(sk), for example from inet_diag
> or tcp timers
> (which can be fired even in un-connected state)
> 
> Even UDP sockets can receive packets while being un-connected,
> and they need to deref the net pointer.
> 
> Currently there is no protection about sock_net(sk) being changed on the fly,
> and the struct net could disappear and be freed.
> 
> There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> you/we want to audit all
> of them to check what could go wrong...

I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
exploration of the usage of sock_net(sk) it appeared that it might be safe to
swap a socket's net ns if it had never been connected, but I looked at only a
subset of such uses.

Introducing ref counting logic to every access of sock_net(sk) may help get
around this but involves a bigger change to increment and decrement the count at
every use of sock_net().

Any suggestions on whether this could be achieved in another way, much closer to
socket creation time, or any comments on our workaround for injecting sockets using
seccomp addfd?


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-02 23:58       ` Alok Tiagi
@ 2023-02-03 15:09         ` Eric Dumazet
  2023-02-03 17:50           ` Alok Tiagi
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2023-02-03 15:09 UTC (permalink / raw)
  To: Alok Tiagi; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel

On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
>
> On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> > >
> > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > > > >             WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > > > >             break;
> > > > >
> > > > > +   case SO_SETNETNS:
> > > > > +   {
> > > > > +           struct net *other_ns, *my_ns;
> > > > > +
> > > > > +           if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > > > +                   ret = -EOPNOTSUPP;
> > > > > +                   break;
> > > > > +           }
> > > > > +
> > > > > +           if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > > > +                   ret = -EOPNOTSUPP;
> > > > > +                   break;
> > > > > +           }
> > > > > +
> > > > > +           other_ns = get_net_ns_by_fd(val);
> > > > > +           if (IS_ERR(other_ns)) {
> > > > > +                   ret = PTR_ERR(other_ns);
> > > > > +                   break;
> > > > > +           }
> > > > > +
> > > > > +           if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > > > +                   ret = -EPERM;
> > > > > +                   goto out_err;
> > > > > +           }
> > > > > +
> > > > > +           /* check that the socket has never been connected or recently disconnected */
> > > > > +           if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > > > +                   ret = -EOPNOTSUPP;
> > > > > +                   goto out_err;
> > > > > +           }
> > > > > +
> > > > > +           /* check that the socket is not bound to an interface*/
> > > > > +           if (sk->sk_bound_dev_if != 0) {
> > > > > +                   ret = -EOPNOTSUPP;
> > > > > +                   goto out_err;
> > > > > +           }
> > > > > +
> > > > > +           my_ns = sock_net(sk);
> > > > > +           sock_net_set(sk, other_ns);
> > > > > +           put_net(my_ns);
> > > > > +           break;
> > > >
> > > >               cpu 0                           cpu 2
> > > >               ---                             ---
> > > >                                               ns = sock_net(sk);
> > > >               my_ns = sock_net(sk);
> > > >               sock_net_set(sk, other_ns);
> > > >               put_net(my_ns);
> > > >                                               ns is invalid ?
> > >
> > > That is the reason we want the socket to be in an un-connected state. That
> > > should help us avoid this situation.
> >
> > This is not enough....
> >
> > Another thread might look at sock_net(sk), for example from inet_diag
> > or tcp timers
> > (which can be fired even in un-connected state)
> >
> > Even UDP sockets can receive packets while being un-connected,
> > and they need to deref the net pointer.
> >
> > Currently there is no protection about sock_net(sk) being changed on the fly,
> > and the struct net could disappear and be freed.
> >
> > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> > you/we want to audit all
> > of them to check what could go wrong...
>
> I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
> exploration of the usage of sock_net(sk) it appeared that it might be safe to
> swap a sockets net ns if it had never been connected but I looked at only a
> subset of such uses.
>
> Introducing a ref counting logic to every access of sock_net(sk) may help get
> around this but invovles a bigger change to increment and decrement the count at
> every use of sock_net().
>
> Any suggestions if this could be achieved in another way much close to the
> socket creation time or any comments on our workaround for injecting sockets using
> seccomp addfd?

Maybe the existing BPF hook in inet_create() could be used ?

err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);

The BPF program might be able to switch the netns, because at this
time the new socket is not
yet visible from external threads.

Although it is not going to catch dual stack uses (open a V6 socket,
then use a v4mapped address at bind()/connect()/...)


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-03 15:09         ` Eric Dumazet
@ 2023-02-03 17:50           ` Alok Tiagi
  2023-02-03 21:17             ` Eric W. Biederman
  0 siblings, 1 reply; 11+ messages in thread
From: Alok Tiagi @ 2023-02-03 17:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel

On Fri, Feb 03, 2023 at 04:09:12PM +0100, Eric Dumazet wrote:
> On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >
> > On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> > > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> > > >
> > > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > > > > >             WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > > > > >             break;
> > > > > >
> > > > > > +   case SO_SETNETNS:
> > > > > > +   {
> > > > > > +           struct net *other_ns, *my_ns;
> > > > > > +
> > > > > > +           if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > > > > +                   ret = -EOPNOTSUPP;
> > > > > > +                   break;
> > > > > > +           }
> > > > > > +
> > > > > > +           if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > > > > +                   ret = -EOPNOTSUPP;
> > > > > > +                   break;
> > > > > > +           }
> > > > > > +
> > > > > > +           other_ns = get_net_ns_by_fd(val);
> > > > > > +           if (IS_ERR(other_ns)) {
> > > > > > +                   ret = PTR_ERR(other_ns);
> > > > > > +                   break;
> > > > > > +           }
> > > > > > +
> > > > > > +           if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > > > > +                   ret = -EPERM;
> > > > > > +                   goto out_err;
> > > > > > +           }
> > > > > > +
> > > > > > +           /* check that the socket has never been connected or recently disconnected */
> > > > > > +           if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > > > > +                   ret = -EOPNOTSUPP;
> > > > > > +                   goto out_err;
> > > > > > +           }
> > > > > > +
> > > > > > +           /* check that the socket is not bound to an interface*/
> > > > > > +           if (sk->sk_bound_dev_if != 0) {
> > > > > > +                   ret = -EOPNOTSUPP;
> > > > > > +                   goto out_err;
> > > > > > +           }
> > > > > > +
> > > > > > +           my_ns = sock_net(sk);
> > > > > > +           sock_net_set(sk, other_ns);
> > > > > > +           put_net(my_ns);
> > > > > > +           break;
> > > > >
> > > > >               cpu 0                           cpu 2
> > > > >               ---                             ---
> > > > >                                               ns = sock_net(sk);
> > > > >               my_ns = sock_net(sk);
> > > > >               sock_net_set(sk, other_ns);
> > > > >               put_net(my_ns);
> > > > >                                               ns is invalid ?
> > > >
> > > > That is the reason we want the socket to be in an un-connected state. That
> > > > should help us avoid this situation.
> > >
> > > This is not enough....
> > >
> > > Another thread might look at sock_net(sk), for example from inet_diag
> > > or tcp timers
> > > (which can be fired even in un-connected state)
> > >
> > > Even UDP sockets can receive packets while being un-connected,
> > > and they need to deref the net pointer.
> > >
> > > Currently there is no protection about sock_net(sk) being changed on the fly,
> > > and the struct net could disappear and be freed.
> > >
> > > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> > > you/we want to audit all
> > > of them to check what could go wrong...
> >
> > I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
> > exploration of the usage of sock_net(sk) it appeared that it might be safe to
> > swap a sockets net ns if it had never been connected but I looked at only a
> > subset of such uses.
> >
> > Introducing a ref counting logic to every access of sock_net(sk) may help get
> > around this but invovles a bigger change to increment and decrement the count at
> > every use of sock_net().
> >
> > Any suggestions if this could be achieved in another way much close to the
> > socket creation time or any comments on our workaround for injecting sockets using
> > seccomp addfd?
> 
> Maybe the existing BPF hook in inet_create() could be used ?
> 
> err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
> 
> The BPF program might be able to switch the netns, because at this
> time the new socket is not
> yet visible from external threads.
> 
> Although it is not going to catch dual stack uses (open a V6 socket,
> then use a v4mapped address at bind()/connect()/...

We thought of a similar approach by intercepting the socket() call in seccomp
and injecting a new file descriptor much earlier but, as you said, we run into the
issue of handling dual stack sockets since we do not know in advance if it is
going to be used for a v4mapped address.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-03 17:50           ` Alok Tiagi
@ 2023-02-03 21:17             ` Eric W. Biederman
  2023-02-04 18:44               ` Alok Tiagi
  0 siblings, 1 reply; 11+ messages in thread
From: Eric W. Biederman @ 2023-02-03 21:17 UTC (permalink / raw)
  To: Alok Tiagi; +Cc: Eric Dumazet, Hillf Danton, netdev, linux-mm, linux-kernel

Alok Tiagi <aloktiagi@gmail.com> writes:

> On Fri, Feb 03, 2023 at 04:09:12PM +0100, Eric Dumazet wrote:
>> On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
>> >
>> > On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
>> > > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
>> > > >
>> > > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
>> > > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
>> > > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
>> > > > > >             WRITE_ONCE(sk->sk_txrehash, (u8)val);
>> > > > > >             break;
>> > > > > >
>> > > > > > +   case SO_SETNETNS:
>> > > > > > +   {
>> > > > > > +           struct net *other_ns, *my_ns;
>> > > > > > +
>> > > > > > +           if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
>> > > > > > +                   ret = -EOPNOTSUPP;
>> > > > > > +                   break;
>> > > > > > +           }
>> > > > > > +
>> > > > > > +           if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
>> > > > > > +                   ret = -EOPNOTSUPP;
>> > > > > > +                   break;
>> > > > > > +           }
>> > > > > > +
>> > > > > > +           other_ns = get_net_ns_by_fd(val);
>> > > > > > +           if (IS_ERR(other_ns)) {
>> > > > > > +                   ret = PTR_ERR(other_ns);
>> > > > > > +                   break;
>> > > > > > +           }
>> > > > > > +
>> > > > > > +           if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
>> > > > > > +                   ret = -EPERM;
>> > > > > > +                   goto out_err;
>> > > > > > +           }
>> > > > > > +
>> > > > > > +           /* check that the socket has never been connected or recently disconnected */
>> > > > > > +           if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
>> > > > > > +                   ret = -EOPNOTSUPP;
>> > > > > > +                   goto out_err;
>> > > > > > +           }
>> > > > > > +
>> > > > > > +           /* check that the socket is not bound to an interface*/
>> > > > > > +           if (sk->sk_bound_dev_if != 0) {
>> > > > > > +                   ret = -EOPNOTSUPP;
>> > > > > > +                   goto out_err;
>> > > > > > +           }
>> > > > > > +
>> > > > > > +           my_ns = sock_net(sk);
>> > > > > > +           sock_net_set(sk, other_ns);
>> > > > > > +           put_net(my_ns);
>> > > > > > +           break;
>> > > > >
>> > > > >               cpu 0                           cpu 2
>> > > > >               ---                             ---
>> > > > >                                               ns = sock_net(sk);
>> > > > >               my_ns = sock_net(sk);
>> > > > >               sock_net_set(sk, other_ns);
>> > > > >               put_net(my_ns);
>> > > > >                                               ns is invalid ?
>> > > >
>> > > > That is the reason we want the socket to be in an un-connected state. That
>> > > > should help us avoid this situation.
>> > >
>> > > This is not enough....
>> > >
>> > > Another thread might look at sock_net(sk), for example from inet_diag
>> > > or tcp timers
>> > > (which can be fired even in un-connected state)
>> > >
>> > > Even UDP sockets can receive packets while being un-connected,
>> > > and they need to deref the net pointer.
>> > >
>> > > Currently there is no protection about sock_net(sk) being changed on the fly,
>> > > and the struct net could disappear and be freed.
>> > >
>> > > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
>> > > you/we want to audit all
>> > > of them to check what could go wrong...
>> >
>> > I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
>> > exploration of the usage of sock_net(sk) it appeared that it might be safe to
>> > swap a sockets net ns if it had never been connected but I looked at only a
>> > subset of such uses.
>> >
>> > Introducing a ref counting logic to every access of sock_net(sk) may help get
>> > around this but invovles a bigger change to increment and decrement the count at
>> > every use of sock_net().
>> >
>> > Any suggestions if this could be achieved in another way much close to the
>> > socket creation time or any comments on our workaround for injecting sockets using
>> > seccomp addfd?
>> 
>> Maybe the existing BPF hook in inet_create() could be used ?
>> 
>> err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
>> 
>> The BPF program might be able to switch the netns, because at this
>> time the new socket is not
>> yet visible from external threads.
>> 
>> Although it is not going to catch dual stack uses (open a V6 socket,
>> then use a v4mapped address at bind()/connect()/...
>
> We thought of a similar approach by intercepting the socket() call in seccomp
> and injecting a new file descritpor much earlier but as you said we run into the
> issue of handling dual stack sockets since we do not know in advance if its
> going to be used for a v4mapped address.

I would suggest adding a default ipv4 route from your ipv6 network
namespaces to your ipv4 network namespace, but that only works for
outbound traffic.  The inbound traffic problem is classically solved
via nat.

That you are not suggesting using nat has me thinking there is something
subtle in what you are trying to do that I am missing.

Perhaps your userspace can do:

	previous_netns = open("/proc/self/ns/net");
	setns(ipv4_netns);
	socket();
	setns(previous_netns);


As the network namespace is per thread this is atomic if you add
the logic to block signals around it.

Eric


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-03 21:17             ` Eric W. Biederman
@ 2023-02-04 18:44               ` Alok Tiagi
  0 siblings, 0 replies; 11+ messages in thread
From: Alok Tiagi @ 2023-02-04 18:44 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Eric Dumazet, Hillf Danton, netdev, linux-mm, linux-kernel, tycho

On Fri, Feb 03, 2023 at 03:17:06PM -0600, Eric W. Biederman wrote:
> Alok Tiagi <aloktiagi@gmail.com> writes:
> 
> > On Fri, Feb 03, 2023 at 04:09:12PM +0100, Eric Dumazet wrote:
> >> On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >> >
> >> > On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> >> > > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >> > > >
> >> > > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> >> > > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> >> > > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> >> > > > > >             WRITE_ONCE(sk->sk_txrehash, (u8)val);
> >> > > > > >             break;
> >> > > > > >
> >> > > > > > +   case SO_SETNETNS:
> >> > > > > > +   {
> >> > > > > > +           struct net *other_ns, *my_ns;
> >> > > > > > +
> >> > > > > > +           if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> >> > > > > > +                   ret = -EOPNOTSUPP;
> >> > > > > > +                   break;
> >> > > > > > +           }
> >> > > > > > +
> >> > > > > > +           if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> >> > > > > > +                   ret = -EOPNOTSUPP;
> >> > > > > > +                   break;
> >> > > > > > +           }
> >> > > > > > +
> >> > > > > > +           other_ns = get_net_ns_by_fd(val);
> >> > > > > > +           if (IS_ERR(other_ns)) {
> >> > > > > > +                   ret = PTR_ERR(other_ns);
> >> > > > > > +                   break;
> >> > > > > > +           }
> >> > > > > > +
> >> > > > > > +           if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> >> > > > > > +                   ret = -EPERM;
> >> > > > > > +                   goto out_err;
> >> > > > > > +           }
> >> > > > > > +
> >> > > > > > +           /* check that the socket has never been connected or recently disconnected */
> >> > > > > > +           if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> >> > > > > > +                   ret = -EOPNOTSUPP;
> >> > > > > > +                   goto out_err;
> >> > > > > > +           }
> >> > > > > > +
> >> > > > > > +           /* check that the socket is not bound to an interface*/
> >> > > > > > +           if (sk->sk_bound_dev_if != 0) {
> >> > > > > > +                   ret = -EOPNOTSUPP;
> >> > > > > > +                   goto out_err;
> >> > > > > > +           }
> >> > > > > > +
> >> > > > > > +           my_ns = sock_net(sk);
> >> > > > > > +           sock_net_set(sk, other_ns);
> >> > > > > > +           put_net(my_ns);
> >> > > > > > +           break;
> >> > > > >
> >> > > > >               cpu 0                           cpu 2
> >> > > > >               ---                             ---
> >> > > > >                                               ns = sock_net(sk);
> >> > > > >               my_ns = sock_net(sk);
> >> > > > >               sock_net_set(sk, other_ns);
> >> > > > >               put_net(my_ns);
> >> > > > >                                               ns is invalid ?
> >> > > >
> >> > > > That is the reason we want the socket to be in an un-connected state. That
> >> > > > should help us avoid this situation.
> >> > >
> >> > > This is not enough....
> >> > >
> >> > > Another thread might look at sock_net(sk), for example from inet_diag
> >> > > or tcp timers
> >> > > (which can be fired even in un-connected state)
> >> > >
> >> > > Even UDP sockets can receive packets while being un-connected,
> >> > > and they need to deref the net pointer.
> >> > >
> >> > > Currently there is no protection about sock_net(sk) being changed on the fly,
> >> > > and the struct net could disappear and be freed.
> >> > >
> >> > > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> >> > > you/we want to audit all
> >> > > of them to check what could go wrong...
> >> >
> >> > I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
> >> > exploration of the usage of sock_net(sk) it appeared that it might be safe to
> >> > swap a sockets net ns if it had never been connected but I looked at only a
> >> > subset of such uses.
> >> >
> >> > Introducing a ref counting logic to every access of sock_net(sk) may help get
> >> > around this but invovles a bigger change to increment and decrement the count at
> >> > every use of sock_net().
> >> >
> >> > Any suggestions if this could be achieved in another way much close to the
> >> > socket creation time or any comments on our workaround for injecting sockets using
> >> > seccomp addfd?
> >> 
> >> Maybe the existing BPF hook in inet_create() could be used ?
> >> 
> >> err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
> >> 
> >> The BPF program might be able to switch the netns, because at this
> >> time the new socket is not
> >> yet visible from external threads.
> >> 
> >> Although it is not going to catch dual stack uses (open a V6 socket,
> >> then use a v4mapped address at bind()/connect()/...
> >
> > We thought of a similar approach by intercepting the socket() call in seccomp
> > and injecting a new file descritpor much earlier but as you said we run into the
> > issue of handling dual stack sockets since we do not know in advance if its
> > going to be used for a v4mapped address.
> 
> I would suggest adding a default ipv4 route from your ipv6 network
> namespaces to your ipv4 network namespace, but that only works for
> outbound traffic.  The inbound traffic problem is classically solved
> via nat.
> 
> That you are not suggesting using nat has me thinking there is something
> subtle in what you are trying to do that I am missing.
> 
> Perhaps your userspace can do:
> 
> 	previous_netns = open("/proc/self/ns/net");
> 	setns(ipv4_netns);
> 	socket();
> 	setns(previous_netns);
> 
> 
> As the network namespace is per thread this is atomic if you add
> the logic to block signals around it.
> 
> Eric

That is correct, we are not using nat, but we are providing a mechanism for the
users of our container platform to move to ipv6 only while keeping egress
connectivity to their ipv4 destinations. We are doing this transparently, without
any change in user code, by intercepting networking syscalls in a container
manager running in a dedicated ipv4 only network namespace. Our current solution,
as described in my original commit message, has limitations and we are looking
for a way to switch a socket's namespace from the ipv6 only container network
namespace to the dedicated ipv4 network namespace, which really simplifies our
design.

Since our userspace is the container workload we have no control over how they
instantiate their sockets.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
  2023-02-02  1:48 ` Hillf Danton
@ 2023-02-07 11:48 ` kernel test robot
  2023-02-07 14:21 ` kernel test robot
  2 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2023-02-07 11:48 UTC (permalink / raw)
  To: aloktiagi; +Cc: llvm, oe-kbuild-all

Hi aloktiagi,

[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on shuah-kselftest/next]
[also build test ERROR on shuah-kselftest/fixes net/master linus/master v6.2-rc7]
[cannot apply to net-next/master next-20230207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link:    https://lore.kernel.org/r/Y9q8Ec1CJILZz7dj%40ip-172-31-38-16.us-west-2.compute.internal
patch subject: [RFC] net: add new socket option SO_SETNETNS
config: mips-mtx1_defconfig (https://download.01.org/0day-ci/archive/20230207/202302071930.65Ha05Kf-lkp@intel.com/config)
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 4196ca3278f78c6e19246e54ab0ecb364e37d66a)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install mips cross compiling tool for clang build
        # apt-get install binutils-mipsel-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/03eff302351f4db1ed733d7b303f3938e511413b
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
        git checkout 03eff302351f4db1ed733d7b303f3938e511413b
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash net/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> net/core/sock.c:1538:7: error: use of undeclared identifier 'SO_SETNETNS'
           case SO_SETNETNS:
                ^
   1 error generated.


vim +/SO_SETNETNS +1538 net/core/sock.c

  1426	
  1427		case SO_MAX_PACING_RATE:
  1428			{
  1429			unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
  1430	
  1431			if (sizeof(ulval) != sizeof(val) &&
  1432			    optlen >= sizeof(ulval) &&
  1433			    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
  1434				ret = -EFAULT;
  1435				break;
  1436			}
  1437			if (ulval != ~0UL)
  1438				cmpxchg(&sk->sk_pacing_status,
  1439					SK_PACING_NONE,
  1440					SK_PACING_NEEDED);
  1441			sk->sk_max_pacing_rate = ulval;
  1442			sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
  1443			break;
  1444			}
  1445		case SO_INCOMING_CPU:
  1446			reuseport_update_incoming_cpu(sk, val);
  1447			break;
  1448	
  1449		case SO_CNX_ADVICE:
  1450			if (val == 1)
  1451				dst_negative_advice(sk);
  1452			break;
  1453	
  1454		case SO_ZEROCOPY:
  1455			if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
  1456				if (!(sk_is_tcp(sk) ||
  1457				      (sk->sk_type == SOCK_DGRAM &&
  1458				       sk->sk_protocol == IPPROTO_UDP)))
  1459					ret = -EOPNOTSUPP;
  1460			} else if (sk->sk_family != PF_RDS) {
  1461				ret = -EOPNOTSUPP;
  1462			}
  1463			if (!ret) {
  1464				if (val < 0 || val > 1)
  1465					ret = -EINVAL;
  1466				else
  1467					sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
  1468			}
  1469			break;
  1470	
  1471		case SO_TXTIME:
  1472			if (optlen != sizeof(struct sock_txtime)) {
  1473				ret = -EINVAL;
  1474				break;
  1475			} else if (copy_from_sockptr(&sk_txtime, optval,
  1476				   sizeof(struct sock_txtime))) {
  1477				ret = -EFAULT;
  1478				break;
  1479			} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
  1480				ret = -EINVAL;
  1481				break;
  1482			}
  1483			/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
  1484			 * scheduler has enough safe guards.
  1485			 */
  1486			if (sk_txtime.clockid != CLOCK_MONOTONIC &&
  1487			    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
  1488				ret = -EPERM;
  1489				break;
  1490			}
  1491			sock_valbool_flag(sk, SOCK_TXTIME, true);
  1492			sk->sk_clockid = sk_txtime.clockid;
  1493			sk->sk_txtime_deadline_mode =
  1494				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
  1495			sk->sk_txtime_report_errors =
  1496				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
  1497			break;
  1498	
  1499		case SO_BINDTOIFINDEX:
  1500			ret = sock_bindtoindex_locked(sk, val);
  1501			break;
  1502	
  1503		case SO_BUF_LOCK:
  1504			if (val & ~SOCK_BUF_LOCK_MASK) {
  1505				ret = -EINVAL;
  1506				break;
  1507			}
  1508			sk->sk_userlocks = val | (sk->sk_userlocks &
  1509						  ~SOCK_BUF_LOCK_MASK);
  1510			break;
  1511	
  1512		case SO_RESERVE_MEM:
  1513		{
  1514			int delta;
  1515	
  1516			if (val < 0) {
  1517				ret = -EINVAL;
  1518				break;
  1519			}
  1520	
  1521			delta = val - sk->sk_reserved_mem;
  1522			if (delta < 0)
  1523				sock_release_reserved_memory(sk, -delta);
  1524			else
  1525				ret = sock_reserve_memory(sk, delta);
  1526			break;
  1527		}
  1528	
  1529		case SO_TXREHASH:
  1530			if (val < -1 || val > 1) {
  1531				ret = -EINVAL;
  1532				break;
  1533			}
  1534			/* Paired with READ_ONCE() in tcp_rtx_synack() */
  1535			WRITE_ONCE(sk->sk_txrehash, (u8)val);
  1536			break;
  1537	
> 1538		case SO_SETNETNS:
  1539		{
  1540			struct net *other_ns, *my_ns;
  1541	
  1542			if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
  1543				ret = -EOPNOTSUPP;
  1544				break;
  1545			}
  1546	
  1547			if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
  1548				ret = -EOPNOTSUPP;
  1549				break;
  1550			}
  1551	
  1552			other_ns = get_net_ns_by_fd(val);
  1553			if (IS_ERR(other_ns)) {
  1554				ret = PTR_ERR(other_ns);
  1555				break;
  1556			}
  1557	
  1558			if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
  1559				ret = -EPERM;
  1560				goto out_err;
  1561			}
  1562	
  1563			/* check that the socket has never been connected or recently disconnected */
  1564			if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
  1565				ret = -EOPNOTSUPP;
  1566				goto out_err;
  1567			}
  1568	
  1569			/* check that the socket is not bound to an interface*/
  1570			if (sk->sk_bound_dev_if != 0) {
  1571				ret = -EOPNOTSUPP;
  1572				goto out_err;
  1573			}
  1574	
  1575			my_ns = sock_net(sk);
  1576			sock_net_set(sk, other_ns);
  1577			put_net(my_ns);
  1578			break;
  1579	out_err:
  1580			put_net(other_ns);
  1581			break;
  1582		}
  1583	
  1584		default:
  1585			ret = -ENOPROTOOPT;
  1586			break;
  1587		}
  1588		sockopt_release_sock(sk);
  1589		return ret;
  1590	}
  1591	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC] net: add new socket option SO_SETNETNS
  2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
  2023-02-02  1:48 ` Hillf Danton
  2023-02-07 11:48 ` kernel test robot
@ 2023-02-07 14:21 ` kernel test robot
  2 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2023-02-07 14:21 UTC (permalink / raw)
  To: aloktiagi; +Cc: oe-kbuild-all

Hi aloktiagi,

[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on shuah-kselftest/next]
[also build test ERROR on shuah-kselftest/fixes net/master linus/master v6.2-rc7]
[cannot apply to net-next/master next-20230207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link:    https://lore.kernel.org/r/Y9q8Ec1CJILZz7dj%40ip-172-31-38-16.us-west-2.compute.internal
patch subject: [RFC] net: add new socket option SO_SETNETNS
config: mips-gcw0_defconfig (https://download.01.org/0day-ci/archive/20230207/202302072207.wLeEHX68-lkp@intel.com/config)
compiler: mipsel-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/03eff302351f4db1ed733d7b303f3938e511413b
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
        git checkout 03eff302351f4db1ed733d7b303f3938e511413b
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash net/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   net/core/sock.c: In function 'sk_setsockopt':
>> net/core/sock.c:1538:14: error: 'SO_SETNETNS' undeclared (first use in this function)
    1538 |         case SO_SETNETNS:
         |              ^~~~~~~~~~~
   net/core/sock.c:1538:14: note: each undeclared identifier is reported only once for each function it appears in


vim +/SO_SETNETNS +1538 net/core/sock.c

  1426	
  1427		case SO_MAX_PACING_RATE:
  1428			{
  1429			unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
  1430	
  1431			if (sizeof(ulval) != sizeof(val) &&
  1432			    optlen >= sizeof(ulval) &&
  1433			    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
  1434				ret = -EFAULT;
  1435				break;
  1436			}
  1437			if (ulval != ~0UL)
  1438				cmpxchg(&sk->sk_pacing_status,
  1439					SK_PACING_NONE,
  1440					SK_PACING_NEEDED);
  1441			sk->sk_max_pacing_rate = ulval;
  1442			sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
  1443			break;
  1444			}
  1445		case SO_INCOMING_CPU:
  1446			reuseport_update_incoming_cpu(sk, val);
  1447			break;
  1448	
  1449		case SO_CNX_ADVICE:
  1450			if (val == 1)
  1451				dst_negative_advice(sk);
  1452			break;
  1453	
  1454		case SO_ZEROCOPY:
  1455			if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
  1456				if (!(sk_is_tcp(sk) ||
  1457				      (sk->sk_type == SOCK_DGRAM &&
  1458				       sk->sk_protocol == IPPROTO_UDP)))
  1459					ret = -EOPNOTSUPP;
  1460			} else if (sk->sk_family != PF_RDS) {
  1461				ret = -EOPNOTSUPP;
  1462			}
  1463			if (!ret) {
  1464				if (val < 0 || val > 1)
  1465					ret = -EINVAL;
  1466				else
  1467					sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
  1468			}
  1469			break;
  1470	
  1471		case SO_TXTIME:
  1472			if (optlen != sizeof(struct sock_txtime)) {
  1473				ret = -EINVAL;
  1474				break;
  1475			} else if (copy_from_sockptr(&sk_txtime, optval,
  1476				   sizeof(struct sock_txtime))) {
  1477				ret = -EFAULT;
  1478				break;
  1479			} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
  1480				ret = -EINVAL;
  1481				break;
  1482			}
  1483			/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
  1484			 * scheduler has enough safe guards.
  1485			 */
  1486			if (sk_txtime.clockid != CLOCK_MONOTONIC &&
  1487			    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
  1488				ret = -EPERM;
  1489				break;
  1490			}
  1491			sock_valbool_flag(sk, SOCK_TXTIME, true);
  1492			sk->sk_clockid = sk_txtime.clockid;
  1493			sk->sk_txtime_deadline_mode =
  1494				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
  1495			sk->sk_txtime_report_errors =
  1496				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
  1497			break;
  1498	
  1499		case SO_BINDTOIFINDEX:
  1500			ret = sock_bindtoindex_locked(sk, val);
  1501			break;
  1502	
  1503		case SO_BUF_LOCK:
  1504			if (val & ~SOCK_BUF_LOCK_MASK) {
  1505				ret = -EINVAL;
  1506				break;
  1507			}
  1508			sk->sk_userlocks = val | (sk->sk_userlocks &
  1509						  ~SOCK_BUF_LOCK_MASK);
  1510			break;
  1511	
  1512		case SO_RESERVE_MEM:
  1513		{
  1514			int delta;
  1515	
  1516			if (val < 0) {
  1517				ret = -EINVAL;
  1518				break;
  1519			}
  1520	
  1521			delta = val - sk->sk_reserved_mem;
  1522			if (delta < 0)
  1523				sock_release_reserved_memory(sk, -delta);
  1524			else
  1525				ret = sock_reserve_memory(sk, delta);
  1526			break;
  1527		}
  1528	
  1529		case SO_TXREHASH:
  1530			if (val < -1 || val > 1) {
  1531				ret = -EINVAL;
  1532				break;
  1533			}
  1534			/* Paired with READ_ONCE() in tcp_rtx_synack() */
  1535			WRITE_ONCE(sk->sk_txrehash, (u8)val);
  1536			break;
  1537	
> 1538		case SO_SETNETNS:
  1539		{
  1540			struct net *other_ns, *my_ns;
  1541	
  1542			if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
  1543				ret = -EOPNOTSUPP;
  1544				break;
  1545			}
  1546	
  1547			if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
  1548				ret = -EOPNOTSUPP;
  1549				break;
  1550			}
  1551	
  1552			other_ns = get_net_ns_by_fd(val);
  1553			if (IS_ERR(other_ns)) {
  1554				ret = PTR_ERR(other_ns);
  1555				break;
  1556			}
  1557	
  1558			if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
  1559				ret = -EPERM;
  1560				goto out_err;
  1561			}
  1562	
  1563			/* check that the socket has never been connected or recently disconnected */
  1564			if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
  1565				ret = -EOPNOTSUPP;
  1566				goto out_err;
  1567			}
  1568	
  1569			/* check that the socket is not bound to an interface*/
  1570			if (sk->sk_bound_dev_if != 0) {
  1571				ret = -EOPNOTSUPP;
  1572				goto out_err;
  1573			}
  1574	
  1575			my_ns = sock_net(sk);
  1576			sock_net_set(sk, other_ns);
  1577			put_net(my_ns);
  1578			break;
  1579	out_err:
  1580			put_net(other_ns);
  1581			break;
  1582		}
  1583	
  1584		default:
  1585			ret = -ENOPROTOOPT;
  1586			break;
  1587		}
  1588		sockopt_release_sock(sk);
  1589		return ret;
  1590	}
  1591	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2023-02-07 14:22 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
2023-02-02  1:48 ` Hillf Danton
2023-02-02 19:55   ` Alok Tiagi
2023-02-02 20:10     ` Eric Dumazet
2023-02-02 23:58       ` Alok Tiagi
2023-02-03 15:09         ` Eric Dumazet
2023-02-03 17:50           ` Alok Tiagi
2023-02-03 21:17             ` Eric W. Biederman
2023-02-04 18:44               ` Alok Tiagi
2023-02-07 11:48 ` kernel test robot
2023-02-07 14:21 ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.