* [RFC] net: add new socket option SO_SETNETNS
@ 2023-02-01 19:22 aloktiagi
2023-02-02 1:48 ` Hillf Danton
` (2 more replies)
0 siblings, 3 replies; 11+ messages in thread
From: aloktiagi @ 2023-02-01 19:22 UTC (permalink / raw)
To: ebiederm
Cc: davem, edumazet, kuba, pabeni, netdev, linux-kernel, tycho,
sargun, aloktiagi
This socket option provides a mechanism for users to switch a socket's network
namespace. This enables use cases where multiple IPv6 only network namespaces
can use a single IPv4 network namespace for IPv4 only egress connectivity by
switching their sockets from IPv6 to IPv4 network namespace. This allows for
migration of systems to IPv6 only while keeping their connectivity to IPv4 only
destinations intact.
Today, we achieve this by setting up a seccomp filter to intercept network system
calls like connect() from a container in a container manager which runs in an
IPv4 only network namespace. The container manager creates a new IPv4 connection
and injects the new file descriptor through SECCOMP_IOCTL_NOTIF_ADDFD, replacing
the original file descriptor from the connect() call. This does not work for
cases where the original file descriptor is handed off to a system like epoll
before the connect() call. After a new file descriptor is injected, the original
file descriptor being referenced by the epoll fd is no longer valid, leading to
failures. As a workaround, the container manager, when intercepting connect(),
loops through all open socket file descriptors to check if they are referencing
the socket attempting the connect() and replaces the reference with the to be
injected file descriptor. This workaround is cumbersome and makes the solution
prone to similar yet to be discovered issues.
With SO_SETNETNS, the container manager can simply switch the original
unconnected socket’s network namespace to the IPv4 only network namespace
without the need for injecting any new socket. The container can then proceed
with the connect() call and establish connectivity to the IPv4 only destination.
This socket option is only allowed for sockets that have never been connected,
since connected or recently disconnected sockets may be bound to their network
namespace's network device and switching their namespace may lead to undefined
behavior.
Signed-off-by: aloktiagi <aloktiagi@gmail.com>
---
include/uapi/asm-generic/socket.h | 2 +
net/core/sock.c | 46 +++++
tools/testing/selftests/net/Makefile | 1 +
tools/testing/selftests/net/so_set_netns.c | 208 +++++++++++++++++++++
4 files changed, 257 insertions(+)
create mode 100644 tools/testing/selftests/net/so_set_netns.c
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 638230899e98..dc9498233fe5 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -132,6 +132,8 @@
#define SO_RCVMARK 75
+#define SO_SETNETNS 76
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/sock.c b/net/core/sock.c
index f954d5893e79..34cb72b211a6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;
+ case SO_SETNETNS:
+ {
+ struct net *other_ns, *my_ns;
+
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ other_ns = get_net_ns_by_fd(val);
+ if (IS_ERR(other_ns)) {
+ ret = PTR_ERR(other_ns);
+ break;
+ }
+
+ if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ goto out_err;
+ }
+
+ /* check that the socket has never been connected or recently disconnected */
+ if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ /* check that the socket is not bound to an interface*/
+ if (sk->sk_bound_dev_if != 0) {
+ ret = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ my_ns = sock_net(sk);
+ sock_net_set(sk, other_ns);
+ put_net(my_ns);
+ break;
+out_err:
+ put_net(other_ns);
+ break;
+ }
+
default:
ret = -ENOPROTOOPT;
break;
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 3007e98a6d64..c2e7679e31bb 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -75,6 +75,7 @@ TEST_GEN_PROGS += so_incoming_cpu
TEST_PROGS += sctp_vrf.sh
TEST_GEN_FILES += sctp_hello
TEST_GEN_FILES += csum
+TEST_GEN_PROGS += so_set_netns
TEST_FILES := settings
diff --git a/tools/testing/selftests/net/so_set_netns.c b/tools/testing/selftests/net/so_set_netns.c
new file mode 100644
index 000000000000..cc7767d23a5d
--- /dev/null
+++ b/tools/testing/selftests/net/so_set_netns.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <linux/tcp.h>
+#include <linux/socket.h>
+
+#include <sys/types.h>
+#include <sys/sendfile.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef SO_SETNETNS
+#define SO_SETNETNS 76
+#endif
+
+static int unshare_open(void)
+{
+ const char *netns_path = "/proc/self/ns/net";
+ int fd, ret;
+
+ if (unshare(CLONE_NEWNET) != 0)
+ return -1;
+
+ fd = open(netns_path, O_RDONLY);
+ if (fd <= 0)
+ return -1;
+
+ ret = system("ip link set lo up");
+ if (ret < 0)
+ return -1;
+
+ return fd;
+}
+
+static int switch_ns(int fd)
+{
+ if (setns(fd, CLONE_NEWNET))
+ return -1;
+ return 0;
+}
+
+static void init_namespaces(struct __test_metadata *_metadata,
+ int *netns_client, int *netns_server)
+{
+ *netns_client = unshare_open();
+ ASSERT_GE(*netns_client, 0);
+
+ *netns_server = unshare_open();
+ ASSERT_GE(*netns_server, 0);
+}
+
+static void setup_network(struct __test_metadata *_metadata,
+ int *netns_client, int *netns_server)
+{
+ int ret;
+
+ ret = switch_ns(*netns_client);
+ ASSERT_EQ(ret, 0);
+
+ ret = system("ip addr add fd::1/64 dev lo");
+ ASSERT_EQ(ret, 0);
+
+ ret = switch_ns(*netns_server);
+ ASSERT_EQ(ret, 0);
+
+ ret = system("ip addr add 192.168.1.1/24 dev lo");
+ ASSERT_EQ(ret, 0);
+}
+
+static void setup_client_server(struct __test_metadata *_metadata,
+ int *netns_client, int *netns_server,
+ int *client_fd, int *server_fd)
+{
+ struct sockaddr_in addr;
+ int ret;
+
+ ret = switch_ns(*netns_client);
+ ASSERT_EQ(ret, 0);
+
+ *client_fd = socket(AF_INET, SOCK_STREAM, 0);
+
+ ret = switch_ns(*netns_server);
+ ASSERT_EQ(ret, 0);
+
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+ addr.sin_port = htons(80);
+
+ *server_fd = socket(AF_INET, SOCK_STREAM, 0);
+ ret = bind(*server_fd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+ ret = listen(*server_fd, 10);
+ ASSERT_EQ(ret, 0);
+}
+
+FIXTURE(so_set_netns)
+{
+ int netns_client, netns_server;
+ int client_fd, server_fd;
+};
+
+FIXTURE_SETUP(so_set_netns)
+{
+ init_namespaces(_metadata, &self->netns_client, &self->netns_server);
+ setup_network(_metadata, &self->netns_client, &self->netns_server);
+ setup_client_server(_metadata,
+ &self->netns_client, &self->netns_server,
+ &self->client_fd, &self->server_fd);
+}
+
+FIXTURE_TEARDOWN(so_set_netns)
+{
+ close(self->client_fd);
+ close(self->server_fd);
+ close(self->netns_client);
+ close(self->netns_server);
+}
+
+TEST_F(so_set_netns, test_socket_ns_switch_unconnected) {
+ struct sockaddr_in addr;
+ int ret;
+
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+ addr.sin_port = htons(80);
+
+ ret = switch_ns(self->netns_client);
+ ASSERT_EQ(ret, 0);
+
+ ret = setsockopt(self->client_fd,
+ SOL_SOCKET, SO_SETNETNS,
+ &self->netns_server,
+ sizeof(self->netns_server));
+ ASSERT_EQ(ret, 0);
+
+ ret = connect(self->client_fd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_F(so_set_netns, test_socket_ns_switch_connected) {
+ struct sockaddr_in addr;
+ int ret;
+
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+ addr.sin_port = htons(80);
+
+ ret = setsockopt(self->client_fd,
+ SOL_SOCKET, SO_SETNETNS,
+ &self->netns_server,
+ sizeof(self->netns_server));
+ ASSERT_EQ(ret, 0);
+
+ ret = connect(self->client_fd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+
+ // switching network namespace of connected
+ // socket should fail
+ ret = setsockopt(self->client_fd,
+ SOL_SOCKET, SO_SETNETNS,
+ &self->netns_client,
+ sizeof(self->netns_client));
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EOPNOTSUPP);
+}
+
+TEST_F(so_set_netns, test_socket_ns_switch_disconnected) {
+ struct sockaddr_in addr;
+ int ret;
+
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr("192.168.1.1");
+ addr.sin_port = htons(80);
+
+ ret = setsockopt(self->client_fd,
+ SOL_SOCKET, SO_SETNETNS,
+ &self->netns_server,
+ sizeof(self->netns_server));
+ ASSERT_EQ(ret, 0);
+
+ ret = connect(self->client_fd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+
+ close(self->server_fd);
+
+ // switching network namespace of recently disconnected
+ // socket should fail
+ ret = setsockopt(self->client_fd,
+ SOL_SOCKET, SO_SETNETNS,
+ &self->netns_client,
+ sizeof(self->netns_client));
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EOPNOTSUPP);
+}
+
+TEST_HARNESS_MAIN
--
2.34.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
@ 2023-02-02 1:48 ` Hillf Danton
2023-02-02 19:55 ` Alok Tiagi
2023-02-07 11:48 ` kernel test robot
2023-02-07 14:21 ` kernel test robot
2 siblings, 1 reply; 11+ messages in thread
From: Hillf Danton @ 2023-02-02 1:48 UTC (permalink / raw)
To: aloktiagi; +Cc: ebiederm, edumazet, netdev, linux-mm, linux-kernel
On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> WRITE_ONCE(sk->sk_txrehash, (u8)val);
> break;
>
> + case SO_SETNETNS:
> + {
> + struct net *other_ns, *my_ns;
> +
> + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> + ret = -EOPNOTSUPP;
> + break;
> + }
> +
> + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> + ret = -EOPNOTSUPP;
> + break;
> + }
> +
> + other_ns = get_net_ns_by_fd(val);
> + if (IS_ERR(other_ns)) {
> + ret = PTR_ERR(other_ns);
> + break;
> + }
> +
> + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> + ret = -EPERM;
> + goto out_err;
> + }
> +
> + /* check that the socket has never been connected or recently disconnected */
> + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> + ret = -EOPNOTSUPP;
> + goto out_err;
> + }
> +
> + /* check that the socket is not bound to an interface*/
> + if (sk->sk_bound_dev_if != 0) {
> + ret = -EOPNOTSUPP;
> + goto out_err;
> + }
> +
> + my_ns = sock_net(sk);
> + sock_net_set(sk, other_ns);
> + put_net(my_ns);
> + break;
cpu 0 cpu 2
--- ---
ns = sock_net(sk);
my_ns = sock_net(sk);
sock_net_set(sk, other_ns);
put_net(my_ns);
ns is invalid ?
> +out_err:
> + put_net(other_ns);
> + break;
> + }
> +
> default:
> ret = -ENOPROTOOPT;
> break;
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-02 1:48 ` Hillf Danton
@ 2023-02-02 19:55 ` Alok Tiagi
2023-02-02 20:10 ` Eric Dumazet
0 siblings, 1 reply; 11+ messages in thread
From: Alok Tiagi @ 2023-02-02 19:55 UTC (permalink / raw)
To: Hillf Danton; +Cc: ebiederm, edumazet, netdev, linux-mm, linux-kernel
On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > break;
> >
> > + case SO_SETNETNS:
> > + {
> > + struct net *other_ns, *my_ns;
> > +
> > + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > + ret = -EOPNOTSUPP;
> > + break;
> > + }
> > +
> > + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > + ret = -EOPNOTSUPP;
> > + break;
> > + }
> > +
> > + other_ns = get_net_ns_by_fd(val);
> > + if (IS_ERR(other_ns)) {
> > + ret = PTR_ERR(other_ns);
> > + break;
> > + }
> > +
> > + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > + ret = -EPERM;
> > + goto out_err;
> > + }
> > +
> > + /* check that the socket has never been connected or recently disconnected */
> > + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > + ret = -EOPNOTSUPP;
> > + goto out_err;
> > + }
> > +
> > + /* check that the socket is not bound to an interface*/
> > + if (sk->sk_bound_dev_if != 0) {
> > + ret = -EOPNOTSUPP;
> > + goto out_err;
> > + }
> > +
> > + my_ns = sock_net(sk);
> > + sock_net_set(sk, other_ns);
> > + put_net(my_ns);
> > + break;
>
> cpu 0 cpu 2
> --- ---
> ns = sock_net(sk);
> my_ns = sock_net(sk);
> sock_net_set(sk, other_ns);
> put_net(my_ns);
> ns is invalid ?
That is the reason we want the socket to be in an un-connected state. That
should help us avoid this situation.
>
> > +out_err:
> > + put_net(other_ns);
> > + break;
> > + }
> > +
> > default:
> > ret = -ENOPROTOOPT;
> > break;
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-02 19:55 ` Alok Tiagi
@ 2023-02-02 20:10 ` Eric Dumazet
2023-02-02 23:58 ` Alok Tiagi
0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2023-02-02 20:10 UTC (permalink / raw)
To: Alok Tiagi; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel
On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
>
> On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > > WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > > break;
> > >
> > > + case SO_SETNETNS:
> > > + {
> > > + struct net *other_ns, *my_ns;
> > > +
> > > + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > + ret = -EOPNOTSUPP;
> > > + break;
> > > + }
> > > +
> > > + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > + ret = -EOPNOTSUPP;
> > > + break;
> > > + }
> > > +
> > > + other_ns = get_net_ns_by_fd(val);
> > > + if (IS_ERR(other_ns)) {
> > > + ret = PTR_ERR(other_ns);
> > > + break;
> > > + }
> > > +
> > > + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > + ret = -EPERM;
> > > + goto out_err;
> > > + }
> > > +
> > > + /* check that the socket has never been connected or recently disconnected */
> > > + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > + ret = -EOPNOTSUPP;
> > > + goto out_err;
> > > + }
> > > +
> > > + /* check that the socket is not bound to an interface*/
> > > + if (sk->sk_bound_dev_if != 0) {
> > > + ret = -EOPNOTSUPP;
> > > + goto out_err;
> > > + }
> > > +
> > > + my_ns = sock_net(sk);
> > > + sock_net_set(sk, other_ns);
> > > + put_net(my_ns);
> > > + break;
> >
> > cpu 0 cpu 2
> > --- ---
> > ns = sock_net(sk);
> > my_ns = sock_net(sk);
> > sock_net_set(sk, other_ns);
> > put_net(my_ns);
> > ns is invalid ?
>
> That is the reason we want the socket to be in an un-connected state. That
> should help us avoid this situation.
This is not enough....
Another thread might look at sock_net(sk), for example from inet_diag
or tcp timers
(which can be fired even in un-connected state)
Even UDP sockets can receive packets while being un-connected,
and they need to deref the net pointer.
Currently there is no protection against sock_net(sk) being changed on the fly,
and the struct net could disappear and be freed.
There are ~1500 uses of sock_net(sk) in the kernel, I do not think
you/we want to audit all
of them to check what could go wrong...
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-02 20:10 ` Eric Dumazet
@ 2023-02-02 23:58 ` Alok Tiagi
2023-02-03 15:09 ` Eric Dumazet
0 siblings, 1 reply; 11+ messages in thread
From: Alok Tiagi @ 2023-02-02 23:58 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel
On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >
> > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > > > WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > > > break;
> > > >
> > > > + case SO_SETNETNS:
> > > > + {
> > > > + struct net *other_ns, *my_ns;
> > > > +
> > > > + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > > + ret = -EOPNOTSUPP;
> > > > + break;
> > > > + }
> > > > +
> > > > + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > > + ret = -EOPNOTSUPP;
> > > > + break;
> > > > + }
> > > > +
> > > > + other_ns = get_net_ns_by_fd(val);
> > > > + if (IS_ERR(other_ns)) {
> > > > + ret = PTR_ERR(other_ns);
> > > > + break;
> > > > + }
> > > > +
> > > > + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > > + ret = -EPERM;
> > > > + goto out_err;
> > > > + }
> > > > +
> > > > + /* check that the socket has never been connected or recently disconnected */
> > > > + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > > + ret = -EOPNOTSUPP;
> > > > + goto out_err;
> > > > + }
> > > > +
> > > > + /* check that the socket is not bound to an interface*/
> > > > + if (sk->sk_bound_dev_if != 0) {
> > > > + ret = -EOPNOTSUPP;
> > > > + goto out_err;
> > > > + }
> > > > +
> > > > + my_ns = sock_net(sk);
> > > > + sock_net_set(sk, other_ns);
> > > > + put_net(my_ns);
> > > > + break;
> > >
> > > cpu 0 cpu 2
> > > --- ---
> > > ns = sock_net(sk);
> > > my_ns = sock_net(sk);
> > > sock_net_set(sk, other_ns);
> > > put_net(my_ns);
> > > ns is invalid ?
> >
> > That is the reason we want the socket to be in an un-connected state. That
> > should help us avoid this situation.
>
> This is not enough....
>
> Another thread might look at sock_net(sk), for example from inet_diag
> or tcp timers
> (which can be fired even in un-connected state)
>
> Even UDP sockets can receive packets while being un-connected,
> and they need to deref the net pointer.
>
> Currently there is no protection about sock_net(sk) being changed on the fly,
> and the struct net could disappear and be freed.
>
> There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> you/we want to audit all
> of them to check what could go wrong...
I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
exploration of the usage of sock_net(sk) it appeared that it might be safe to
swap a socket's net ns if it had never been connected, but I looked at only a
subset of such uses.
Introducing ref counting logic to every access of sock_net(sk) may help get
around this but involves a bigger change to increment and decrement the count at
every use of sock_net().
Any suggestions on whether this could be achieved in another way, much closer to
socket creation time, or any comments on our workaround for injecting sockets using
seccomp addfd?
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-02 23:58 ` Alok Tiagi
@ 2023-02-03 15:09 ` Eric Dumazet
2023-02-03 17:50 ` Alok Tiagi
0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2023-02-03 15:09 UTC (permalink / raw)
To: Alok Tiagi; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel
On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
>
> On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> > >
> > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > > > > WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > > > > break;
> > > > >
> > > > > + case SO_SETNETNS:
> > > > > + {
> > > > > + struct net *other_ns, *my_ns;
> > > > > +
> > > > > + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > > > + ret = -EOPNOTSUPP;
> > > > > + break;
> > > > > + }
> > > > > +
> > > > > + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > > > + ret = -EOPNOTSUPP;
> > > > > + break;
> > > > > + }
> > > > > +
> > > > > + other_ns = get_net_ns_by_fd(val);
> > > > > + if (IS_ERR(other_ns)) {
> > > > > + ret = PTR_ERR(other_ns);
> > > > > + break;
> > > > > + }
> > > > > +
> > > > > + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > > > + ret = -EPERM;
> > > > > + goto out_err;
> > > > > + }
> > > > > +
> > > > > + /* check that the socket has never been connected or recently disconnected */
> > > > > + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > > > + ret = -EOPNOTSUPP;
> > > > > + goto out_err;
> > > > > + }
> > > > > +
> > > > > + /* check that the socket is not bound to an interface*/
> > > > > + if (sk->sk_bound_dev_if != 0) {
> > > > > + ret = -EOPNOTSUPP;
> > > > > + goto out_err;
> > > > > + }
> > > > > +
> > > > > + my_ns = sock_net(sk);
> > > > > + sock_net_set(sk, other_ns);
> > > > > + put_net(my_ns);
> > > > > + break;
> > > >
> > > > cpu 0 cpu 2
> > > > --- ---
> > > > ns = sock_net(sk);
> > > > my_ns = sock_net(sk);
> > > > sock_net_set(sk, other_ns);
> > > > put_net(my_ns);
> > > > ns is invalid ?
> > >
> > > That is the reason we want the socket to be in an un-connected state. That
> > > should help us avoid this situation.
> >
> > This is not enough....
> >
> > Another thread might look at sock_net(sk), for example from inet_diag
> > or tcp timers
> > (which can be fired even in un-connected state)
> >
> > Even UDP sockets can receive packets while being un-connected,
> > and they need to deref the net pointer.
> >
> > Currently there is no protection about sock_net(sk) being changed on the fly,
> > and the struct net could disappear and be freed.
> >
> > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> > you/we want to audit all
> > of them to check what could go wrong...
>
> I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
> exploration of the usage of sock_net(sk) it appeared that it might be safe to
> swap a sockets net ns if it had never been connected but I looked at only a
> subset of such uses.
>
> Introducing a ref counting logic to every access of sock_net(sk) may help get
> around this but invovles a bigger change to increment and decrement the count at
> every use of sock_net().
>
> Any suggestions if this could be achieved in another way much close to the
> socket creation time or any comments on our workaround for injecting sockets using
> seccomp addfd?
Maybe the existing BPF hook in inet_create() could be used ?
err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
The BPF program might be able to switch the netns, because at this
time the new socket is not
yet visible from external threads.
Although it is not going to catch dual stack uses (open a V6 socket,
then use a v4mapped address at bind()/connect()/...)
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-03 15:09 ` Eric Dumazet
@ 2023-02-03 17:50 ` Alok Tiagi
2023-02-03 21:17 ` Eric W. Biederman
0 siblings, 1 reply; 11+ messages in thread
From: Alok Tiagi @ 2023-02-03 17:50 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Hillf Danton, ebiederm, netdev, linux-mm, linux-kernel
On Fri, Feb 03, 2023 at 04:09:12PM +0100, Eric Dumazet wrote:
> On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >
> > On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> > > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> > > >
> > > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> > > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> > > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> > > > > > WRITE_ONCE(sk->sk_txrehash, (u8)val);
> > > > > > break;
> > > > > >
> > > > > > + case SO_SETNETNS:
> > > > > > + {
> > > > > > + struct net *other_ns, *my_ns;
> > > > > > +
> > > > > > + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> > > > > > + ret = -EOPNOTSUPP;
> > > > > > + break;
> > > > > > + }
> > > > > > +
> > > > > > + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> > > > > > + ret = -EOPNOTSUPP;
> > > > > > + break;
> > > > > > + }
> > > > > > +
> > > > > > + other_ns = get_net_ns_by_fd(val);
> > > > > > + if (IS_ERR(other_ns)) {
> > > > > > + ret = PTR_ERR(other_ns);
> > > > > > + break;
> > > > > > + }
> > > > > > +
> > > > > > + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> > > > > > + ret = -EPERM;
> > > > > > + goto out_err;
> > > > > > + }
> > > > > > +
> > > > > > + /* check that the socket has never been connected or recently disconnected */
> > > > > > + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> > > > > > + ret = -EOPNOTSUPP;
> > > > > > + goto out_err;
> > > > > > + }
> > > > > > +
> > > > > > + /* check that the socket is not bound to an interface*/
> > > > > > + if (sk->sk_bound_dev_if != 0) {
> > > > > > + ret = -EOPNOTSUPP;
> > > > > > + goto out_err;
> > > > > > + }
> > > > > > +
> > > > > > + my_ns = sock_net(sk);
> > > > > > + sock_net_set(sk, other_ns);
> > > > > > + put_net(my_ns);
> > > > > > + break;
> > > > >
> > > > > cpu 0 cpu 2
> > > > > --- ---
> > > > > ns = sock_net(sk);
> > > > > my_ns = sock_net(sk);
> > > > > sock_net_set(sk, other_ns);
> > > > > put_net(my_ns);
> > > > > ns is invalid ?
> > > >
> > > > That is the reason we want the socket to be in an un-connected state. That
> > > > should help us avoid this situation.
> > >
> > > This is not enough....
> > >
> > > Another thread might look at sock_net(sk), for example from inet_diag
> > > or tcp timers
> > > (which can be fired even in un-connected state)
> > >
> > > Even UDP sockets can receive packets while being un-connected,
> > > and they need to deref the net pointer.
> > >
> > > Currently there is no protection about sock_net(sk) being changed on the fly,
> > > and the struct net could disappear and be freed.
> > >
> > > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> > > you/we want to audit all
> > > of them to check what could go wrong...
> >
> > I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
> > exploration of the usage of sock_net(sk) it appeared that it might be safe to
> > swap a sockets net ns if it had never been connected but I looked at only a
> > subset of such uses.
> >
> > Introducing a ref counting logic to every access of sock_net(sk) may help get
> > around this but invovles a bigger change to increment and decrement the count at
> > every use of sock_net().
> >
> > Any suggestions if this could be achieved in another way much close to the
> > socket creation time or any comments on our workaround for injecting sockets using
> > seccomp addfd?
>
> Maybe the existing BPF hook in inet_create() could be used ?
>
> err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
>
> The BPF program might be able to switch the netns, because at this
> time the new socket is not
> yet visible from external threads.
>
> Although it is not going to catch dual stack uses (open a V6 socket,
> then use a v4mapped address at bind()/connect()/...
We thought of a similar approach by intercepting the socket() call in seccomp
and injecting a new file descriptor much earlier, but as you said we run into the
issue of handling dual stack sockets since we do not know in advance whether it is
going to be used for a v4mapped address.
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-03 17:50 ` Alok Tiagi
@ 2023-02-03 21:17 ` Eric W. Biederman
2023-02-04 18:44 ` Alok Tiagi
0 siblings, 1 reply; 11+ messages in thread
From: Eric W. Biederman @ 2023-02-03 21:17 UTC (permalink / raw)
To: Alok Tiagi; +Cc: Eric Dumazet, Hillf Danton, netdev, linux-mm, linux-kernel
Alok Tiagi <aloktiagi@gmail.com> writes:
> On Fri, Feb 03, 2023 at 04:09:12PM +0100, Eric Dumazet wrote:
>> On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
>> >
>> > On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
>> > > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
>> > > >
>> > > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
>> > > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
>> > > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
>> > > > > > WRITE_ONCE(sk->sk_txrehash, (u8)val);
>> > > > > > break;
>> > > > > >
>> > > > > > + case SO_SETNETNS:
>> > > > > > + {
>> > > > > > + struct net *other_ns, *my_ns;
>> > > > > > +
>> > > > > > + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
>> > > > > > + ret = -EOPNOTSUPP;
>> > > > > > + break;
>> > > > > > + }
>> > > > > > +
>> > > > > > + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
>> > > > > > + ret = -EOPNOTSUPP;
>> > > > > > + break;
>> > > > > > + }
>> > > > > > +
>> > > > > > + other_ns = get_net_ns_by_fd(val);
>> > > > > > + if (IS_ERR(other_ns)) {
>> > > > > > + ret = PTR_ERR(other_ns);
>> > > > > > + break;
>> > > > > > + }
>> > > > > > +
>> > > > > > + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
>> > > > > > + ret = -EPERM;
>> > > > > > + goto out_err;
>> > > > > > + }
>> > > > > > +
>> > > > > > + /* check that the socket has never been connected or recently disconnected */
>> > > > > > + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
>> > > > > > + ret = -EOPNOTSUPP;
>> > > > > > + goto out_err;
>> > > > > > + }
>> > > > > > +
>> > > > > > + /* check that the socket is not bound to an interface*/
>> > > > > > + if (sk->sk_bound_dev_if != 0) {
>> > > > > > + ret = -EOPNOTSUPP;
>> > > > > > + goto out_err;
>> > > > > > + }
>> > > > > > +
>> > > > > > + my_ns = sock_net(sk);
>> > > > > > + sock_net_set(sk, other_ns);
>> > > > > > + put_net(my_ns);
>> > > > > > + break;
>> > > > >
>> > > > > cpu 0 cpu 2
>> > > > > --- ---
>> > > > > ns = sock_net(sk);
>> > > > > my_ns = sock_net(sk);
>> > > > > sock_net_set(sk, other_ns);
>> > > > > put_net(my_ns);
>> > > > > ns is invalid ?
>> > > >
>> > > > That is the reason we want the socket to be in an un-connected state. That
>> > > > should help us avoid this situation.
>> > >
>> > > This is not enough....
>> > >
>> > > Another thread might look at sock_net(sk), for example from inet_diag
>> > > or tcp timers
>> > > (which can be fired even in un-connected state)
>> > >
>> > > Even UDP sockets can receive packets while being un-connected,
>> > > and they need to deref the net pointer.
>> > >
>> > > Currently there is no protection about sock_net(sk) being changed on the fly,
>> > > and the struct net could disappear and be freed.
>> > >
>> > > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
>> > > you/we want to audit all
>> > > of them to check what could go wrong...
>> >
>> > I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
>> > exploration of the usage of sock_net(sk) it appeared that it might be safe to
>> > swap a sockets net ns if it had never been connected but I looked at only a
>> > subset of such uses.
>> >
>> > Introducing a ref counting logic to every access of sock_net(sk) may help get
>> > around this but invovles a bigger change to increment and decrement the count at
>> > every use of sock_net().
>> >
>> > Any suggestions if this could be achieved in another way much close to the
>> > socket creation time or any comments on our workaround for injecting sockets using
>> > seccomp addfd?
>>
>> Maybe the existing BPF hook in inet_create() could be used ?
>>
>> err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
>>
>> The BPF program might be able to switch the netns, because at this
>> time the new socket is not
>> yet visible from external threads.
>>
>> Although it is not going to catch dual stack uses (open a V6 socket,
>> then use a v4mapped address at bind()/connect()/...
>
> We thought of a similar approach by intercepting the socket() call in seccomp
> and injecting a new file descritpor much earlier but as you said we run into the
> issue of handling dual stack sockets since we do not know in advance if its
> going to be used for a v4mapped address.
I would suggest adding a default ipv4 route from your ipv6 network
namespaces to your ipv4 network namespace, but that only works for
outbound traffic. The inbound traffic problem is classically solved
via nat.
That you are not suggesting using nat has me thinking there is something
subtle in what you are trying to do that I am missing.
Perhaps your userspace can do:
previous_netns = open("/proc/self/ns/net");
setns(ipv4_netns);
socket();
setns(previous_netns);
As the network namespace is per thread this is atomic if you add
the logic to block signals around it.
Eric
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-03 21:17 ` Eric W. Biederman
@ 2023-02-04 18:44 ` Alok Tiagi
0 siblings, 0 replies; 11+ messages in thread
From: Alok Tiagi @ 2023-02-04 18:44 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Eric Dumazet, Hillf Danton, netdev, linux-mm, linux-kernel, tycho
On Fri, Feb 03, 2023 at 03:17:06PM -0600, Eric W. Biederman wrote:
> Alok Tiagi <aloktiagi@gmail.com> writes:
>
> > On Fri, Feb 03, 2023 at 04:09:12PM +0100, Eric Dumazet wrote:
> >> On Fri, Feb 3, 2023 at 12:59 AM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >> >
> >> > On Thu, Feb 02, 2023 at 09:10:23PM +0100, Eric Dumazet wrote:
> >> > > On Thu, Feb 2, 2023 at 8:55 PM Alok Tiagi <aloktiagi@gmail.com> wrote:
> >> > > >
> >> > > > On Thu, Feb 02, 2023 at 09:48:10AM +0800, Hillf Danton wrote:
> >> > > > > On Wed, 1 Feb 2023 19:22:57 +0000 aloktiagi <aloktiagi@gmail.com>
> >> > > > > > @@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
> >> > > > > > WRITE_ONCE(sk->sk_txrehash, (u8)val);
> >> > > > > > break;
> >> > > > > >
> >> > > > > > + case SO_SETNETNS:
> >> > > > > > + {
> >> > > > > > + struct net *other_ns, *my_ns;
> >> > > > > > +
> >> > > > > > + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
> >> > > > > > + ret = -EOPNOTSUPP;
> >> > > > > > + break;
> >> > > > > > + }
> >> > > > > > +
> >> > > > > > + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
> >> > > > > > + ret = -EOPNOTSUPP;
> >> > > > > > + break;
> >> > > > > > + }
> >> > > > > > +
> >> > > > > > + other_ns = get_net_ns_by_fd(val);
> >> > > > > > + if (IS_ERR(other_ns)) {
> >> > > > > > + ret = PTR_ERR(other_ns);
> >> > > > > > + break;
> >> > > > > > + }
> >> > > > > > +
> >> > > > > > + if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
> >> > > > > > + ret = -EPERM;
> >> > > > > > + goto out_err;
> >> > > > > > + }
> >> > > > > > +
> >> > > > > > + /* check that the socket has never been connected or recently disconnected */
> >> > > > > > + if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
> >> > > > > > + ret = -EOPNOTSUPP;
> >> > > > > > + goto out_err;
> >> > > > > > + }
> >> > > > > > +
> >> > > > > > + /* check that the socket is not bound to an interface*/
> >> > > > > > + if (sk->sk_bound_dev_if != 0) {
> >> > > > > > + ret = -EOPNOTSUPP;
> >> > > > > > + goto out_err;
> >> > > > > > + }
> >> > > > > > +
> >> > > > > > + my_ns = sock_net(sk);
> >> > > > > > + sock_net_set(sk, other_ns);
> >> > > > > > + put_net(my_ns);
> >> > > > > > + break;
> >> > > > >
> >> > > > > cpu 0 cpu 2
> >> > > > > --- ---
> >> > > > > ns = sock_net(sk);
> >> > > > > my_ns = sock_net(sk);
> >> > > > > sock_net_set(sk, other_ns);
> >> > > > > put_net(my_ns);
> >> > > > > ns is invalid ?
> >> > > >
> >> > > > That is the reason we want the socket to be in an un-connected state. That
> >> > > > should help us avoid this situation.
> >> > >
> >> > > This is not enough....
> >> > >
> >> > > Another thread might look at sock_net(sk), for example from inet_diag
> >> > > or tcp timers
> >> > > (which can be fired even in un-connected state)
> >> > >
> >> > > Even UDP sockets can receive packets while being un-connected,
> >> > > and they need to deref the net pointer.
> >> > >
> >> > > Currently there is no protection about sock_net(sk) being changed on the fly,
> >> > > and the struct net could disappear and be freed.
> >> > >
> >> > > There are ~1500 uses of sock_net(sk) in the kernel, I do not think
> >> > > you/we want to audit all
> >> > > of them to check what could go wrong...
> >> >
> >> > I agree, auditing all the uses of sock_net(sk) is not a feasible option. From my
> >> > exploration of the usage of sock_net(sk) it appeared that it might be safe to
> >> > swap a sockets net ns if it had never been connected but I looked at only a
> >> > subset of such uses.
> >> >
> >> > Introducing a ref counting logic to every access of sock_net(sk) may help get
> >> > around this but involves a bigger change to increment and decrement the count at
> >> > every use of sock_net().
> >> >
> >> > Any suggestions if this could be achieved in another way much closer to the
> >> > socket creation time or any comments on our workaround for injecting sockets using
> >> > seccomp addfd?
> >>
> >> Maybe the existing BPF hook in inet_create() could be used ?
> >>
> >> err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
> >>
> >> The BPF program might be able to switch the netns, because at this
> >> time the new socket is not
> >> yet visible from external threads.
> >>
> >> Although it is not going to catch dual stack uses (open a V6 socket,
> >> then use a v4mapped address at bind()/connect()/...
> >
> > We thought of a similar approach by intercepting the socket() call in seccomp
> > > and injecting a new file descriptor much earlier but as you said we run into the
> > > issue of handling dual stack sockets since we do not know in advance if it's
> > going to be used for a v4mapped address.
>
> I would suggest adding a default ipv4 route from your ipv6 network
> namespaces to your ipv4 network namespace, but that only works for
> outbound traffic. The inbound traffic problem is classically solved
> via nat.
>
> That you are not suggesting using nat has me thinking there is something
> subtle in what you are trying to do that I am missing.
>
> Perhaps your userspace can do:
>
> previous_netns = open("/proc/self/ns/net");
> setns(ipv4_netns);
> socket();
> setns(previous_netns);
>
>
> As the network namespace is per thread this is atomic if you add
> the logic to block signals around it.
>
> Eric
That is correct, we are not using nat, but we are providing a mechanism for the
users of our container platform to move to ipv6 only while keeping egress
connectivity to their ipv4 destinations. We are doing this transparently without
any change in user code, but by intercepting networking syscalls in a container
manager running in a dedicated ipv4 only network namespace. Our current solution
as described in my original commit message has limitations and we are looking
for a way to switch a socket's namespace from the ipv6 only container network
namespace to the dedicated ipv4 network namespace which really simplifies our
design.
Since our userspace is the container workload we have no control over how they
instantiate their sockets.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
2023-02-02 1:48 ` Hillf Danton
@ 2023-02-07 11:48 ` kernel test robot
2023-02-07 14:21 ` kernel test robot
2 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2023-02-07 11:48 UTC (permalink / raw)
To: aloktiagi; +Cc: llvm, oe-kbuild-all
Hi aloktiagi,
[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on shuah-kselftest/next]
[also build test ERROR on shuah-kselftest/fixes net/master linus/master v6.2-rc7]
[cannot apply to net-next/master next-20230207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
base: https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link: https://lore.kernel.org/r/Y9q8Ec1CJILZz7dj%40ip-172-31-38-16.us-west-2.compute.internal
patch subject: [RFC] net: add new socket option SO_SETNETNS
config: mips-mtx1_defconfig (https://download.01.org/0day-ci/archive/20230207/202302071930.65Ha05Kf-lkp@intel.com/config)
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 4196ca3278f78c6e19246e54ab0ecb364e37d66a)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install mips cross compiling tool for clang build
# apt-get install binutils-mipsel-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/03eff302351f4db1ed733d7b303f3938e511413b
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
git checkout 03eff302351f4db1ed733d7b303f3938e511413b
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash net/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
>> net/core/sock.c:1538:7: error: use of undeclared identifier 'SO_SETNETNS'
case SO_SETNETNS:
^
1 error generated.
vim +/SO_SETNETNS +1538 net/core/sock.c
1426
1427 case SO_MAX_PACING_RATE:
1428 {
1429 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1430
1431 if (sizeof(ulval) != sizeof(val) &&
1432 optlen >= sizeof(ulval) &&
1433 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1434 ret = -EFAULT;
1435 break;
1436 }
1437 if (ulval != ~0UL)
1438 cmpxchg(&sk->sk_pacing_status,
1439 SK_PACING_NONE,
1440 SK_PACING_NEEDED);
1441 sk->sk_max_pacing_rate = ulval;
1442 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1443 break;
1444 }
1445 case SO_INCOMING_CPU:
1446 reuseport_update_incoming_cpu(sk, val);
1447 break;
1448
1449 case SO_CNX_ADVICE:
1450 if (val == 1)
1451 dst_negative_advice(sk);
1452 break;
1453
1454 case SO_ZEROCOPY:
1455 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1456 if (!(sk_is_tcp(sk) ||
1457 (sk->sk_type == SOCK_DGRAM &&
1458 sk->sk_protocol == IPPROTO_UDP)))
1459 ret = -EOPNOTSUPP;
1460 } else if (sk->sk_family != PF_RDS) {
1461 ret = -EOPNOTSUPP;
1462 }
1463 if (!ret) {
1464 if (val < 0 || val > 1)
1465 ret = -EINVAL;
1466 else
1467 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1468 }
1469 break;
1470
1471 case SO_TXTIME:
1472 if (optlen != sizeof(struct sock_txtime)) {
1473 ret = -EINVAL;
1474 break;
1475 } else if (copy_from_sockptr(&sk_txtime, optval,
1476 sizeof(struct sock_txtime))) {
1477 ret = -EFAULT;
1478 break;
1479 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1480 ret = -EINVAL;
1481 break;
1482 }
1483 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1484 * scheduler has enough safe guards.
1485 */
1486 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1487 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1488 ret = -EPERM;
1489 break;
1490 }
1491 sock_valbool_flag(sk, SOCK_TXTIME, true);
1492 sk->sk_clockid = sk_txtime.clockid;
1493 sk->sk_txtime_deadline_mode =
1494 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1495 sk->sk_txtime_report_errors =
1496 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1497 break;
1498
1499 case SO_BINDTOIFINDEX:
1500 ret = sock_bindtoindex_locked(sk, val);
1501 break;
1502
1503 case SO_BUF_LOCK:
1504 if (val & ~SOCK_BUF_LOCK_MASK) {
1505 ret = -EINVAL;
1506 break;
1507 }
1508 sk->sk_userlocks = val | (sk->sk_userlocks &
1509 ~SOCK_BUF_LOCK_MASK);
1510 break;
1511
1512 case SO_RESERVE_MEM:
1513 {
1514 int delta;
1515
1516 if (val < 0) {
1517 ret = -EINVAL;
1518 break;
1519 }
1520
1521 delta = val - sk->sk_reserved_mem;
1522 if (delta < 0)
1523 sock_release_reserved_memory(sk, -delta);
1524 else
1525 ret = sock_reserve_memory(sk, delta);
1526 break;
1527 }
1528
1529 case SO_TXREHASH:
1530 if (val < -1 || val > 1) {
1531 ret = -EINVAL;
1532 break;
1533 }
1534 /* Paired with READ_ONCE() in tcp_rtx_synack() */
1535 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1536 break;
1537
> 1538 case SO_SETNETNS:
1539 {
1540 struct net *other_ns, *my_ns;
1541
1542 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
1543 ret = -EOPNOTSUPP;
1544 break;
1545 }
1546
1547 if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
1548 ret = -EOPNOTSUPP;
1549 break;
1550 }
1551
1552 other_ns = get_net_ns_by_fd(val);
1553 if (IS_ERR(other_ns)) {
1554 ret = PTR_ERR(other_ns);
1555 break;
1556 }
1557
1558 if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
1559 ret = -EPERM;
1560 goto out_err;
1561 }
1562
1563 /* check that the socket has never been connected or recently disconnected */
1564 if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
1565 ret = -EOPNOTSUPP;
1566 goto out_err;
1567 }
1568
1569 /* check that the socket is not bound to an interface*/
1570 if (sk->sk_bound_dev_if != 0) {
1571 ret = -EOPNOTSUPP;
1572 goto out_err;
1573 }
1574
1575 my_ns = sock_net(sk);
1576 sock_net_set(sk, other_ns);
1577 put_net(my_ns);
1578 break;
1579 out_err:
1580 put_net(other_ns);
1581 break;
1582 }
1583
1584 default:
1585 ret = -ENOPROTOOPT;
1586 break;
1587 }
1588 sockopt_release_sock(sk);
1589 return ret;
1590 }
1591
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFC] net: add new socket option SO_SETNETNS
2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
2023-02-02 1:48 ` Hillf Danton
2023-02-07 11:48 ` kernel test robot
@ 2023-02-07 14:21 ` kernel test robot
2 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2023-02-07 14:21 UTC (permalink / raw)
To: aloktiagi; +Cc: oe-kbuild-all
Hi aloktiagi,
[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on shuah-kselftest/next]
[also build test ERROR on shuah-kselftest/fixes net/master linus/master v6.2-rc7]
[cannot apply to net-next/master next-20230207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
base: https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link: https://lore.kernel.org/r/Y9q8Ec1CJILZz7dj%40ip-172-31-38-16.us-west-2.compute.internal
patch subject: [RFC] net: add new socket option SO_SETNETNS
config: mips-gcw0_defconfig (https://download.01.org/0day-ci/archive/20230207/202302072207.wLeEHX68-lkp@intel.com/config)
compiler: mipsel-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/03eff302351f4db1ed733d7b303f3938e511413b
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review aloktiagi/net-add-new-socket-option-SO_SETNETNS/20230202-032540
git checkout 03eff302351f4db1ed733d7b303f3938e511413b
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash net/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
net/core/sock.c: In function 'sk_setsockopt':
>> net/core/sock.c:1538:14: error: 'SO_SETNETNS' undeclared (first use in this function)
1538 | case SO_SETNETNS:
| ^~~~~~~~~~~
net/core/sock.c:1538:14: note: each undeclared identifier is reported only once for each function it appears in
vim +/SO_SETNETNS +1538 net/core/sock.c
1426
1427 case SO_MAX_PACING_RATE:
1428 {
1429 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1430
1431 if (sizeof(ulval) != sizeof(val) &&
1432 optlen >= sizeof(ulval) &&
1433 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1434 ret = -EFAULT;
1435 break;
1436 }
1437 if (ulval != ~0UL)
1438 cmpxchg(&sk->sk_pacing_status,
1439 SK_PACING_NONE,
1440 SK_PACING_NEEDED);
1441 sk->sk_max_pacing_rate = ulval;
1442 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1443 break;
1444 }
1445 case SO_INCOMING_CPU:
1446 reuseport_update_incoming_cpu(sk, val);
1447 break;
1448
1449 case SO_CNX_ADVICE:
1450 if (val == 1)
1451 dst_negative_advice(sk);
1452 break;
1453
1454 case SO_ZEROCOPY:
1455 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1456 if (!(sk_is_tcp(sk) ||
1457 (sk->sk_type == SOCK_DGRAM &&
1458 sk->sk_protocol == IPPROTO_UDP)))
1459 ret = -EOPNOTSUPP;
1460 } else if (sk->sk_family != PF_RDS) {
1461 ret = -EOPNOTSUPP;
1462 }
1463 if (!ret) {
1464 if (val < 0 || val > 1)
1465 ret = -EINVAL;
1466 else
1467 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1468 }
1469 break;
1470
1471 case SO_TXTIME:
1472 if (optlen != sizeof(struct sock_txtime)) {
1473 ret = -EINVAL;
1474 break;
1475 } else if (copy_from_sockptr(&sk_txtime, optval,
1476 sizeof(struct sock_txtime))) {
1477 ret = -EFAULT;
1478 break;
1479 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1480 ret = -EINVAL;
1481 break;
1482 }
1483 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1484 * scheduler has enough safe guards.
1485 */
1486 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1487 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1488 ret = -EPERM;
1489 break;
1490 }
1491 sock_valbool_flag(sk, SOCK_TXTIME, true);
1492 sk->sk_clockid = sk_txtime.clockid;
1493 sk->sk_txtime_deadline_mode =
1494 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1495 sk->sk_txtime_report_errors =
1496 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1497 break;
1498
1499 case SO_BINDTOIFINDEX:
1500 ret = sock_bindtoindex_locked(sk, val);
1501 break;
1502
1503 case SO_BUF_LOCK:
1504 if (val & ~SOCK_BUF_LOCK_MASK) {
1505 ret = -EINVAL;
1506 break;
1507 }
1508 sk->sk_userlocks = val | (sk->sk_userlocks &
1509 ~SOCK_BUF_LOCK_MASK);
1510 break;
1511
1512 case SO_RESERVE_MEM:
1513 {
1514 int delta;
1515
1516 if (val < 0) {
1517 ret = -EINVAL;
1518 break;
1519 }
1520
1521 delta = val - sk->sk_reserved_mem;
1522 if (delta < 0)
1523 sock_release_reserved_memory(sk, -delta);
1524 else
1525 ret = sock_reserve_memory(sk, delta);
1526 break;
1527 }
1528
1529 case SO_TXREHASH:
1530 if (val < -1 || val > 1) {
1531 ret = -EINVAL;
1532 break;
1533 }
1534 /* Paired with READ_ONCE() in tcp_rtx_synack() */
1535 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1536 break;
1537
> 1538 case SO_SETNETNS:
1539 {
1540 struct net *other_ns, *my_ns;
1541
1542 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) {
1543 ret = -EOPNOTSUPP;
1544 break;
1545 }
1546
1547 if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) {
1548 ret = -EOPNOTSUPP;
1549 break;
1550 }
1551
1552 other_ns = get_net_ns_by_fd(val);
1553 if (IS_ERR(other_ns)) {
1554 ret = PTR_ERR(other_ns);
1555 break;
1556 }
1557
1558 if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) {
1559 ret = -EPERM;
1560 goto out_err;
1561 }
1562
1563 /* check that the socket has never been connected or recently disconnected */
1564 if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
1565 ret = -EOPNOTSUPP;
1566 goto out_err;
1567 }
1568
1569 /* check that the socket is not bound to an interface*/
1570 if (sk->sk_bound_dev_if != 0) {
1571 ret = -EOPNOTSUPP;
1572 goto out_err;
1573 }
1574
1575 my_ns = sock_net(sk);
1576 sock_net_set(sk, other_ns);
1577 put_net(my_ns);
1578 break;
1579 out_err:
1580 put_net(other_ns);
1581 break;
1582 }
1583
1584 default:
1585 ret = -ENOPROTOOPT;
1586 break;
1587 }
1588 sockopt_release_sock(sk);
1589 return ret;
1590 }
1591
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2023-02-07 14:22 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-02-01 19:22 [RFC] net: add new socket option SO_SETNETNS aloktiagi
2023-02-02 1:48 ` Hillf Danton
2023-02-02 19:55 ` Alok Tiagi
2023-02-02 20:10 ` Eric Dumazet
2023-02-02 23:58 ` Alok Tiagi
2023-02-03 15:09 ` Eric Dumazet
2023-02-03 17:50 ` Alok Tiagi
2023-02-03 21:17 ` Eric W. Biederman
2023-02-04 18:44 ` Alok Tiagi
2023-02-07 11:48 ` kernel test robot
2023-02-07 14:21 ` kernel test robot
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.