* [PATCH v2 1/3] vsock: add network namespace support
2025-03-12 20:59 [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
@ 2025-03-12 20:59 ` Bobby Eshleman
2025-03-19 13:02 ` Stefano Garzarella
2025-03-12 20:59 ` [PATCH v2 2/3] vsock/virtio_transport_common: handle netns of received packets Bobby Eshleman
` (4 subsequent siblings)
5 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-12 20:59 UTC (permalink / raw)
To: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list
Cc: David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm, Bobby Eshleman
From: Stefano Garzarella <sgarzare@redhat.com>
This patch adds a check of the "net" assigned to a socket during
vsock_find_bound_socket() and vsock_find_connected_socket() lookups to
support network namespaces, allowing the same address (CID, port) to be
shared across different network namespaces.
This patch preserves the old behavior and does not yet fully enable
namespace support.
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
---
v1 -> v2:
* remove 'netns' module param
* remove vsock_net_eq()
* use vsock_global_net() for "global" namespace
* use fallback logic in socket lookup functions, giving precedence to
non-global vsock namespaces
RFC -> v1
* added 'netns' module param
* added 'vsock_net_eq()' to check the "net" assigned to a socket
only when 'netns' support is enabled
---
include/net/af_vsock.h | 7 +++--
net/vmw_vsock/af_vsock.c | 55 ++++++++++++++++++++++++---------
net/vmw_vsock/hyperv_transport.c | 2 +-
net/vmw_vsock/virtio_transport_common.c | 5 +--
net/vmw_vsock/vmci_transport.c | 4 +--
5 files changed, 51 insertions(+), 22 deletions(-)
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 9e85424c834353d016a527070dd62e15ff3bfce1..41afbc18648c953da27a93571d408de968aa7668 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -213,9 +213,10 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
void vsock_insert_connected(struct vsock_sock *vsk);
void vsock_remove_bound(struct vsock_sock *vsk);
void vsock_remove_connected(struct vsock_sock *vsk);
-struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net);
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
- struct sockaddr_vm *dst);
+ struct sockaddr_vm *dst,
+ struct net *net);
void vsock_remove_sock(struct vsock_sock *vsk);
void vsock_for_each_connected_socket(struct vsock_transport *transport,
void (*fn)(struct sock *sk));
@@ -255,4 +256,6 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
{
return t->msgzerocopy_allow && t->msgzerocopy_allow();
}
+
+struct net *vsock_global_net(void);
#endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 7e3db87ae4333cf63327ec105ca99253569bb9fe..d206489bf0a81cf989387c7c8063be91a7c21a7d 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -235,37 +235,60 @@ static void __vsock_remove_connected(struct vsock_sock *vsk)
sock_put(&vsk->sk);
}
-static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
+struct net *vsock_global_net(void)
{
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(vsock_global_net);
+
+static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr,
+ struct net *net)
+{
+ struct sock *fallback = NULL;
struct vsock_sock *vsk;
list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
- if (vsock_addr_equals_addr(addr, &vsk->local_addr))
- return sk_vsock(vsk);
+ if (vsock_addr_equals_addr(addr, &vsk->local_addr)) {
+ if (net_eq(net, sock_net(sk_vsock(vsk))))
+ return sk_vsock(vsk);
+ if (net_eq(net, vsock_global_net()))
+ fallback = sk_vsock(vsk);
+ }
if (addr->svm_port == vsk->local_addr.svm_port &&
(vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
- addr->svm_cid == VMADDR_CID_ANY))
- return sk_vsock(vsk);
+ addr->svm_cid == VMADDR_CID_ANY)) {
+ if (net_eq(net, sock_net(sk_vsock(vsk))))
+ return sk_vsock(vsk);
+
+ if (net_eq(net, vsock_global_net()))
+ fallback = sk_vsock(vsk);
+ }
}
- return NULL;
+ return fallback;
}
static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
- struct sockaddr_vm *dst)
+ struct sockaddr_vm *dst,
+ struct net *net)
{
+ struct sock *fallback = NULL;
struct vsock_sock *vsk;
list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
connected_table) {
if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
dst->svm_port == vsk->local_addr.svm_port) {
- return sk_vsock(vsk);
+ if (net_eq(net, sock_net(sk_vsock(vsk))))
+ return sk_vsock(vsk);
+
+ if (net_eq(net, vsock_global_net()))
+ fallback = sk_vsock(vsk);
}
}
- return NULL;
+ return fallback;
}
static void vsock_insert_unbound(struct vsock_sock *vsk)
@@ -304,12 +327,12 @@ void vsock_remove_connected(struct vsock_sock *vsk)
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);
-struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net)
{
struct sock *sk;
spin_lock_bh(&vsock_table_lock);
- sk = __vsock_find_bound_socket(addr);
+ sk = __vsock_find_bound_socket(addr, net);
if (sk)
sock_hold(sk);
@@ -320,12 +343,13 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
- struct sockaddr_vm *dst)
+ struct sockaddr_vm *dst,
+ struct net *net)
{
struct sock *sk;
spin_lock_bh(&vsock_table_lock);
- sk = __vsock_find_connected_socket(src, dst);
+ sk = __vsock_find_connected_socket(src, dst, net);
if (sk)
sock_hold(sk);
@@ -644,6 +668,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
{
static u32 port;
struct sockaddr_vm new_addr;
+ struct net *net = sock_net(sk_vsock(vsk));
if (!port)
port = get_random_u32_above(LAST_RESERVED_PORT);
@@ -660,7 +685,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
new_addr.svm_port = port++;
- if (!__vsock_find_bound_socket(&new_addr)) {
+ if (!__vsock_find_bound_socket(&new_addr, net)) {
found = true;
break;
}
@@ -677,7 +702,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
return -EACCES;
}
- if (__vsock_find_bound_socket(&new_addr))
+ if (__vsock_find_bound_socket(&new_addr, net))
return -EADDRINUSE;
}
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 31342ab502b4fc35feb812d2c94e0e35ded73771..253609898d24f8a484fcfc3296011c6f501a72a8 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -313,7 +313,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
return;
hvs_addr_init(&addr, conn_from_host ? if_type : if_instance);
- sk = vsock_find_bound_socket(&addr);
+ sk = vsock_find_bound_socket(&addr, NULL);
if (!sk)
return;
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7f7de6d8809655fe522749fbbc9025df71f071bd..256d2a4fe482b3cb938a681b6924be69b2065616 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1590,6 +1590,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
struct sk_buff *skb)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+ struct net *net = vsock_global_net();
struct sockaddr_vm src, dst;
struct vsock_sock *vsk;
struct sock *sk;
@@ -1617,9 +1618,9 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
/* The socket must be in connected or bound table
* otherwise send reset back
*/
- sk = vsock_find_connected_socket(&src, &dst);
+ sk = vsock_find_connected_socket(&src, &dst, net);
if (!sk) {
- sk = vsock_find_bound_socket(&dst);
+ sk = vsock_find_bound_socket(&dst, net);
if (!sk) {
(void)virtio_transport_reset_no_sock(t, skb);
goto free_pkt;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index b370070194fa4ac0df45a073d389ffccf69a0029..373b9fe30a26c18aaa181fbc16db840d8f839b13 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -703,9 +703,9 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port);
- sk = vsock_find_connected_socket(&src, &dst);
+ sk = vsock_find_connected_socket(&src, &dst, NULL);
if (!sk) {
- sk = vsock_find_bound_socket(&dst);
+ sk = vsock_find_bound_socket(&dst, NULL);
if (!sk) {
/* We could not find a socket for this specified
* address. If this packet is a RST, we just drop it.
--
2.47.1
* Re: [PATCH v2 1/3] vsock: add network namespace support
2025-03-12 20:59 ` [PATCH v2 1/3] vsock: add network namespace support Bobby Eshleman
@ 2025-03-19 13:02 ` Stefano Garzarella
2025-03-19 19:00 ` Bobby Eshleman
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-19 13:02 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 12, 2025 at 01:59:35PM -0700, Bobby Eshleman wrote:
>From: Stefano Garzarella <sgarzare@redhat.com>
>
>This patch adds a check of the "net" assigned to a socket during
>the vsock_find_bound_socket() and vsock_find_connected_socket()
>to support network namespace, allowing to share the same address
>(cid, port) across different network namespaces.
>
>This patch preserves old behavior, and does not yet bring up namespace
>support fully.
>
>Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
I'd describe here a bit the new behaviour related to `fallback` that you
developed.
Or we can split this patch in two patches: one with my changes without
the fallback, and another with the fallback, with you as the author.
WDYT?
>Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
>---
>v1 -> v2:
>* remove 'netns' module param
>* remove vsock_net_eq()
>* use vsock_global_net() for "global" namespace
>* use fallback logic in socket lookup functions, giving precedence to
> non-global vsock namespaces
>
>RFC -> v1
>* added 'netns' module param
>* added 'vsock_net_eq()' to check the "net" assigned to a socket
> only when 'netns' support is enabled
>---
> include/net/af_vsock.h | 7 +++--
> net/vmw_vsock/af_vsock.c | 55 ++++++++++++++++++++++++---------
> net/vmw_vsock/hyperv_transport.c | 2 +-
> net/vmw_vsock/virtio_transport_common.c | 5 +--
> net/vmw_vsock/vmci_transport.c | 4 +--
> 5 files changed, 51 insertions(+), 22 deletions(-)
>
>diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>index 9e85424c834353d016a527070dd62e15ff3bfce1..41afbc18648c953da27a93571d408de968aa7668 100644
>--- a/include/net/af_vsock.h
>+++ b/include/net/af_vsock.h
>@@ -213,9 +213,10 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
> void vsock_insert_connected(struct vsock_sock *vsk);
> void vsock_remove_bound(struct vsock_sock *vsk);
> void vsock_remove_connected(struct vsock_sock *vsk);
>-struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
>+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net);
> struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
>- struct sockaddr_vm *dst);
>+ struct sockaddr_vm *dst,
>+ struct net *net);
> void vsock_remove_sock(struct vsock_sock *vsk);
> void vsock_for_each_connected_socket(struct vsock_transport *transport,
> void (*fn)(struct sock *sk));
>@@ -255,4 +256,6 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
> {
> return t->msgzerocopy_allow && t->msgzerocopy_allow();
> }
>+
>+struct net *vsock_global_net(void);
If it just returns null, maybe we can make it inline here.
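For example, a minimal sketch of the inlined version in af_vsock.h
(illustrative only; NULL is the placeholder this series uses for the
"global" namespace):
/* Return the "global" vsock namespace; NULL is the placeholder used by
 * this series for lookups that are not confined to a namespace.
 */
static inline struct net *vsock_global_net(void)
{
	return NULL;
}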
> #endif /* __AF_VSOCK_H__ */
>diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>index 7e3db87ae4333cf63327ec105ca99253569bb9fe..d206489bf0a81cf989387c7c8063be91a7c21a7d 100644
>--- a/net/vmw_vsock/af_vsock.c
>+++ b/net/vmw_vsock/af_vsock.c
>@@ -235,37 +235,60 @@ static void __vsock_remove_connected(struct vsock_sock *vsk)
> sock_put(&vsk->sk);
> }
>
>-static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
>+struct net *vsock_global_net(void)
> {
>+ return NULL;
>+}
>+EXPORT_SYMBOL_GPL(vsock_global_net);
>+
>+static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr,
>+ struct net *net)
>+{
Please add a comment here to describe what fallback is used for.
And I would suggest also something on top of this file to explain a bit
how netns are handled in AF_VSOCK.
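Something along these lines, for example (wording is only a suggestion,
matching the fallback logic introduced by this patch):
/* Namespace handling in AF_VSOCK (suggested wording):
 *
 * Each vsock socket belongs to the netns of the process that created it.
 * Lookups first try an exact netns match. If the lookup is performed on
 * behalf of the "global" namespace (vsock_global_net(), currently NULL),
 * a matching socket from any namespace is kept as a fallback, preserving
 * the old behavior for transports that are not namespace-aware yet.
 */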
>+ struct sock *fallback = NULL;
> struct vsock_sock *vsk;
>
> list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
>- if (vsock_addr_equals_addr(addr, &vsk->local_addr))
>- return sk_vsock(vsk);
>+ if (vsock_addr_equals_addr(addr, &vsk->local_addr)) {
>+ if (net_eq(net, sock_net(sk_vsock(vsk))))
>+ return sk_vsock(vsk);
>
>+ if (net_eq(net, vsock_global_net()))
>+ fallback = sk_vsock(vsk);
>+ }
> if (addr->svm_port == vsk->local_addr.svm_port &&
> (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
>- addr->svm_cid == VMADDR_CID_ANY))
>- return sk_vsock(vsk);
>+ addr->svm_cid == VMADDR_CID_ANY)) {
>+ if (net_eq(net, sock_net(sk_vsock(vsk))))
>+ return sk_vsock(vsk);
>+
>+ if (net_eq(net, vsock_global_net()))
>+ fallback = sk_vsock(vsk);
>+ }
> }
>
>- return NULL;
>+ return fallback;
> }
>
> static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
>- struct sockaddr_vm *dst)
>+ struct sockaddr_vm *dst,
>+ struct net *net)
> {
>+ struct sock *fallback = NULL;
> struct vsock_sock *vsk;
>
> list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
> connected_table) {
> if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
> dst->svm_port == vsk->local_addr.svm_port) {
>- return sk_vsock(vsk);
>+ if (net_eq(net, sock_net(sk_vsock(vsk))))
>+ return sk_vsock(vsk);
>+
>+ if (net_eq(net, vsock_global_net()))
>+ fallback = sk_vsock(vsk);
This pattern seems to be repeated 3 times, can we make a function/macro?
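For example (hypothetical name, just to sketch the idea):
static struct sock *vsock_net_match_sk(struct vsock_sock *vsk, struct net *net,
				       struct sock **fallback)
{
	struct sock *sk = sk_vsock(vsk);

	/* Exact namespace match wins immediately. */
	if (net_eq(net, sock_net(sk)))
		return sk;

	/* A lookup in the "global" namespace may fall back to any match. */
	if (net_eq(net, vsock_global_net()))
		*fallback = sk;

	return NULL;
}
so each of the three sites reduces to something like:
	sk = vsock_net_match_sk(vsk, net, &fallback);
	if (sk)
		return sk;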
> }
> }
>
>- return NULL;
>+ return fallback;
> }
>
> static void vsock_insert_unbound(struct vsock_sock *vsk)
>@@ -304,12 +327,12 @@ void vsock_remove_connected(struct vsock_sock *vsk)
> }
> EXPORT_SYMBOL_GPL(vsock_remove_connected);
>
>-struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
>+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net)
> {
> struct sock *sk;
>
> spin_lock_bh(&vsock_table_lock);
>- sk = __vsock_find_bound_socket(addr);
>+ sk = __vsock_find_bound_socket(addr, net);
> if (sk)
> sock_hold(sk);
>
>@@ -320,12 +343,13 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
> EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
>
> struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
>- struct sockaddr_vm *dst)
>+ struct sockaddr_vm *dst,
>+ struct net *net)
> {
> struct sock *sk;
>
> spin_lock_bh(&vsock_table_lock);
>- sk = __vsock_find_connected_socket(src, dst);
>+ sk = __vsock_find_connected_socket(src, dst, net);
> if (sk)
> sock_hold(sk);
>
>@@ -644,6 +668,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
> {
> static u32 port;
> struct sockaddr_vm new_addr;
>+ struct net *net = sock_net(sk_vsock(vsk));
>
> if (!port)
> port = get_random_u32_above(LAST_RESERVED_PORT);
>@@ -660,7 +685,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
>
> new_addr.svm_port = port++;
>
>- if (!__vsock_find_bound_socket(&new_addr)) {
>+ if (!__vsock_find_bound_socket(&new_addr, net)) {
> found = true;
> break;
> }
>@@ -677,7 +702,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
> return -EACCES;
> }
>
>- if (__vsock_find_bound_socket(&new_addr))
>+ if (__vsock_find_bound_socket(&new_addr, net))
> return -EADDRINUSE;
> }
>
>diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
>index 31342ab502b4fc35feb812d2c94e0e35ded73771..253609898d24f8a484fcfc3296011c6f501a72a8 100644
>--- a/net/vmw_vsock/hyperv_transport.c
>+++ b/net/vmw_vsock/hyperv_transport.c
>@@ -313,7 +313,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
> return;
>
> hvs_addr_init(&addr, conn_from_host ? if_type : if_instance);
>- sk = vsock_find_bound_socket(&addr);
>+ sk = vsock_find_bound_socket(&addr, NULL);
> if (!sk)
> return;
>
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index 7f7de6d8809655fe522749fbbc9025df71f071bd..256d2a4fe482b3cb938a681b6924be69b2065616 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -1590,6 +1590,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> struct sk_buff *skb)
> {
> struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
>+ struct net *net = vsock_global_net();
Why use vsock_global_net() in virtio but pass NULL directly in the other
transports?
> struct sockaddr_vm src, dst;
> struct vsock_sock *vsk;
> struct sock *sk;
>@@ -1617,9 +1618,9 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> /* The socket must be in connected or bound table
> * otherwise send reset back
> */
>- sk = vsock_find_connected_socket(&src, &dst);
>+ sk = vsock_find_connected_socket(&src, &dst, net);
> if (!sk) {
>- sk = vsock_find_bound_socket(&dst);
>+ sk = vsock_find_bound_socket(&dst, net);
> if (!sk) {
> (void)virtio_transport_reset_no_sock(t, skb);
> goto free_pkt;
>diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
>index b370070194fa4ac0df45a073d389ffccf69a0029..373b9fe30a26c18aaa181fbc16db840d8f839b13 100644
>--- a/net/vmw_vsock/vmci_transport.c
>+++ b/net/vmw_vsock/vmci_transport.c
>@@ -703,9 +703,9 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
> vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
> vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port);
>
>- sk = vsock_find_connected_socket(&src, &dst);
>+ sk = vsock_find_connected_socket(&src, &dst, NULL);
> if (!sk) {
>- sk = vsock_find_bound_socket(&dst);
>+ sk = vsock_find_bound_socket(&dst, NULL);
> if (!sk) {
> /* We could not find a socket for this specified
> * address. If this packet is a RST, we just drop it.
>
>--
>2.47.1
>
* Re: [PATCH v2 1/3] vsock: add network namespace support
2025-03-19 13:02 ` Stefano Garzarella
@ 2025-03-19 19:00 ` Bobby Eshleman
2025-03-20 8:57 ` Stefano Garzarella
0 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-19 19:00 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 19, 2025 at 02:02:32PM +0100, Stefano Garzarella wrote:
> On Wed, Mar 12, 2025 at 01:59:35PM -0700, Bobby Eshleman wrote:
> > From: Stefano Garzarella <sgarzare@redhat.com>
> >
> > This patch adds a check of the "net" assigned to a socket during
> > the vsock_find_bound_socket() and vsock_find_connected_socket()
> > to support network namespace, allowing to share the same address
> > (cid, port) across different network namespaces.
> >
> > This patch preserves old behavior, and does not yet bring up namespace
> > support fully.
> >
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>
> I'd describe here a bit the new behaviour related to `fallback` that you
> developed.
>
> Or we can split this patch in two patches, one with my changes without
> fallback, and another with fallback as you as author.
>
> WDYT?
>
I like the idea of splitting it, that way any unforeseen issues in the
new logic can be isolated to the one patch.
>
> > Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
> > ---
> > v1 -> v2:
> > * remove 'netns' module param
> > * remove vsock_net_eq()
> > * use vsock_global_net() for "global" namespace
> > * use fallback logic in socket lookup functions, giving precedence to
> > non-global vsock namespaces
> >
> > RFC -> v1
> > * added 'netns' module param
> > * added 'vsock_net_eq()' to check the "net" assigned to a socket
> > only when 'netns' support is enabled
> > ---
> > include/net/af_vsock.h | 7 +++--
> > net/vmw_vsock/af_vsock.c | 55 ++++++++++++++++++++++++---------
> > net/vmw_vsock/hyperv_transport.c | 2 +-
> > net/vmw_vsock/virtio_transport_common.c | 5 +--
> > net/vmw_vsock/vmci_transport.c | 4 +--
> > 5 files changed, 51 insertions(+), 22 deletions(-)
> >
> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> > index 9e85424c834353d016a527070dd62e15ff3bfce1..41afbc18648c953da27a93571d408de968aa7668 100644
> > --- a/include/net/af_vsock.h
> > +++ b/include/net/af_vsock.h
> > @@ -213,9 +213,10 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
> > void vsock_insert_connected(struct vsock_sock *vsk);
> > void vsock_remove_bound(struct vsock_sock *vsk);
> > void vsock_remove_connected(struct vsock_sock *vsk);
> > -struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
> > +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net);
> > struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
> > - struct sockaddr_vm *dst);
> > + struct sockaddr_vm *dst,
> > + struct net *net);
> > void vsock_remove_sock(struct vsock_sock *vsk);
> > void vsock_for_each_connected_socket(struct vsock_transport *transport,
> > void (*fn)(struct sock *sk));
> > @@ -255,4 +256,6 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
> > {
> > return t->msgzerocopy_allow && t->msgzerocopy_allow();
> > }
> > +
> > +struct net *vsock_global_net(void);
>
> If it just returns null, maybe we can make it inline here.
>
Roger that.
> > #endif /* __AF_VSOCK_H__ */
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index 7e3db87ae4333cf63327ec105ca99253569bb9fe..d206489bf0a81cf989387c7c8063be91a7c21a7d 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -235,37 +235,60 @@ static void __vsock_remove_connected(struct vsock_sock *vsk)
> > sock_put(&vsk->sk);
> > }
> >
> > -static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
> > +struct net *vsock_global_net(void)
> > {
> > + return NULL;
> > +}
> > +EXPORT_SYMBOL_GPL(vsock_global_net);
> > +
> > +static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr,
> > + struct net *net)
> > +{
>
> Please add a comment here to describe what fallback is used for.
> And I would suggest also something on top of this file to explain a bit
> how netns are handled in AF_VSOCK.
>
sgtm!
> > + struct sock *fallback = NULL;
> > struct vsock_sock *vsk;
> >
> > list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
> > - if (vsock_addr_equals_addr(addr, &vsk->local_addr))
> > - return sk_vsock(vsk);
> > + if (vsock_addr_equals_addr(addr, &vsk->local_addr)) {
> > + if (net_eq(net, sock_net(sk_vsock(vsk))))
> > + return sk_vsock(vsk);
> >
> > + if (net_eq(net, vsock_global_net()))
> > + fallback = sk_vsock(vsk);
> > + }
> > if (addr->svm_port == vsk->local_addr.svm_port &&
> > (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
> > - addr->svm_cid == VMADDR_CID_ANY))
> > - return sk_vsock(vsk);
> > + addr->svm_cid == VMADDR_CID_ANY)) {
> > + if (net_eq(net, sock_net(sk_vsock(vsk))))
> > + return sk_vsock(vsk);
> > +
> > + if (net_eq(net, vsock_global_net()))
> > + fallback = sk_vsock(vsk);
> > + }
> > }
> >
> > - return NULL;
> > + return fallback;
> > }
> >
> > static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
> > - struct sockaddr_vm *dst)
> > + struct sockaddr_vm *dst,
> > + struct net *net)
> > {
> > + struct sock *fallback = NULL;
> > struct vsock_sock *vsk;
> >
> > list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
> > connected_table) {
> > if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
> > dst->svm_port == vsk->local_addr.svm_port) {
> > - return sk_vsock(vsk);
> > + if (net_eq(net, sock_net(sk_vsock(vsk))))
> > + return sk_vsock(vsk);
> > +
> > + if (net_eq(net, vsock_global_net()))
> > + fallback = sk_vsock(vsk);
>
> This pattern seems to be repeated 3 times, can we make a function/macro?
>
yep, no problem!
> > }
> > }
> >
> > - return NULL;
> > + return fallback;
> > }
> >
> > static void vsock_insert_unbound(struct vsock_sock *vsk)
> > @@ -304,12 +327,12 @@ void vsock_remove_connected(struct vsock_sock *vsk)
> > }
> > EXPORT_SYMBOL_GPL(vsock_remove_connected);
> >
> > -struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
> > +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net)
> > {
> > struct sock *sk;
> >
> > spin_lock_bh(&vsock_table_lock);
> > - sk = __vsock_find_bound_socket(addr);
> > + sk = __vsock_find_bound_socket(addr, net);
> > if (sk)
> > sock_hold(sk);
> >
> > @@ -320,12 +343,13 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
> > EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
> >
> > struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
> > - struct sockaddr_vm *dst)
> > + struct sockaddr_vm *dst,
> > + struct net *net)
> > {
> > struct sock *sk;
> >
> > spin_lock_bh(&vsock_table_lock);
> > - sk = __vsock_find_connected_socket(src, dst);
> > + sk = __vsock_find_connected_socket(src, dst, net);
> > if (sk)
> > sock_hold(sk);
> >
> > @@ -644,6 +668,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
> > {
> > static u32 port;
> > struct sockaddr_vm new_addr;
> > + struct net *net = sock_net(sk_vsock(vsk));
> >
> > if (!port)
> > port = get_random_u32_above(LAST_RESERVED_PORT);
> > @@ -660,7 +685,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
> >
> > new_addr.svm_port = port++;
> >
> > - if (!__vsock_find_bound_socket(&new_addr)) {
> > + if (!__vsock_find_bound_socket(&new_addr, net)) {
> > found = true;
> > break;
> > }
> > @@ -677,7 +702,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
> > return -EACCES;
> > }
> >
> > - if (__vsock_find_bound_socket(&new_addr))
> > + if (__vsock_find_bound_socket(&new_addr, net))
> > return -EADDRINUSE;
> > }
> >
> > diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
> > index 31342ab502b4fc35feb812d2c94e0e35ded73771..253609898d24f8a484fcfc3296011c6f501a72a8 100644
> > --- a/net/vmw_vsock/hyperv_transport.c
> > +++ b/net/vmw_vsock/hyperv_transport.c
> > @@ -313,7 +313,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
> > return;
> >
> > hvs_addr_init(&addr, conn_from_host ? if_type : if_instance);
> > - sk = vsock_find_bound_socket(&addr);
> > + sk = vsock_find_bound_socket(&addr, NULL);
> > if (!sk)
> > return;
> >
> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> > index 7f7de6d8809655fe522749fbbc9025df71f071bd..256d2a4fe482b3cb938a681b6924be69b2065616 100644
> > --- a/net/vmw_vsock/virtio_transport_common.c
> > +++ b/net/vmw_vsock/virtio_transport_common.c
> > @@ -1590,6 +1590,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> > struct sk_buff *skb)
> > {
> > struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
> > + struct net *net = vsock_global_net();
>
> Why using vsock_global_net() in virtio and directly NULL in the others
> transports?
>
This was an oversight on my part: I found an unnamed NULL harder to
reason about, switched to the function, but forgot to switch over the
other transports.
BTW, I was unsure about just making NULL a macro (e.g.,
VIRTIO_VSOCK_GLOBAL_NET?) instead of a function. I just used a function
because A) I noticed in the prior rev that the default net was a
function instead of some macro to &init_net, and B) the function seemed
a little more flexible for future changes. What are your thoughts here?
Thanks for the review!
Best,
Bobby
* Re: [PATCH v2 1/3] vsock: add network namespace support
2025-03-19 19:00 ` Bobby Eshleman
@ 2025-03-20 8:57 ` Stefano Garzarella
2025-03-20 20:56 ` Bobby Eshleman
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-20 8:57 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 19, 2025 at 12:00:38PM -0700, Bobby Eshleman wrote:
>On Wed, Mar 19, 2025 at 02:02:32PM +0100, Stefano Garzarella wrote:
>> On Wed, Mar 12, 2025 at 01:59:35PM -0700, Bobby Eshleman wrote:
>> > From: Stefano Garzarella <sgarzare@redhat.com>
>> >
>> > This patch adds a check of the "net" assigned to a socket during
>> > the vsock_find_bound_socket() and vsock_find_connected_socket()
>> > to support network namespace, allowing to share the same address
>> > (cid, port) across different network namespaces.
>> >
>> > This patch preserves old behavior, and does not yet bring up namespace
>> > support fully.
>> >
>> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>>
>> I'd describe here a bit the new behaviour related to `fallback` that you
>> developed.
>>
>> Or we can split this patch in two patches, one with my changes without
>> fallback, and another with fallback as you as author.
>>
>> WDYT?
>>
>
>I like the idea of splitting it, that way any unforeseen issues in the
>new logic can be isolated to the one patch.
>
>>
>> > Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
>> > ---
>> > v1 -> v2:
>> > * remove 'netns' module param
>> > * remove vsock_net_eq()
>> > * use vsock_global_net() for "global" namespace
>> > * use fallback logic in socket lookup functions, giving precedence to
>> > non-global vsock namespaces
>> >
>> > RFC -> v1
>> > * added 'netns' module param
>> > * added 'vsock_net_eq()' to check the "net" assigned to a socket
>> > only when 'netns' support is enabled
>> > ---
>> > include/net/af_vsock.h | 7 +++--
>> > net/vmw_vsock/af_vsock.c | 55 ++++++++++++++++++++++++---------
>> > net/vmw_vsock/hyperv_transport.c | 2 +-
>> > net/vmw_vsock/virtio_transport_common.c | 5 +--
>> > net/vmw_vsock/vmci_transport.c | 4 +--
>> > 5 files changed, 51 insertions(+), 22 deletions(-)
>> >
>> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>> > index 9e85424c834353d016a527070dd62e15ff3bfce1..41afbc18648c953da27a93571d408de968aa7668 100644
>> > --- a/include/net/af_vsock.h
>> > +++ b/include/net/af_vsock.h
>> > @@ -213,9 +213,10 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
>> > void vsock_insert_connected(struct vsock_sock *vsk);
>> > void vsock_remove_bound(struct vsock_sock *vsk);
>> > void vsock_remove_connected(struct vsock_sock *vsk);
>> > -struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
>> > +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net);
>> > struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
>> > - struct sockaddr_vm *dst);
>> > + struct sockaddr_vm *dst,
>> > + struct net *net);
>> > void vsock_remove_sock(struct vsock_sock *vsk);
>> > void vsock_for_each_connected_socket(struct vsock_transport *transport,
>> > void (*fn)(struct sock *sk));
>> > @@ -255,4 +256,6 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
>> > {
>> > return t->msgzerocopy_allow && t->msgzerocopy_allow();
>> > }
>> > +
>> > +struct net *vsock_global_net(void);
>>
>> If it just returns null, maybe we can make it inline here.
>>
>
>Roger that.
>
>> > #endif /* __AF_VSOCK_H__ */
>> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>> > index 7e3db87ae4333cf63327ec105ca99253569bb9fe..d206489bf0a81cf989387c7c8063be91a7c21a7d 100644
>> > --- a/net/vmw_vsock/af_vsock.c
>> > +++ b/net/vmw_vsock/af_vsock.c
>> > @@ -235,37 +235,60 @@ static void __vsock_remove_connected(struct vsock_sock *vsk)
>> > sock_put(&vsk->sk);
>> > }
>> >
>> > -static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
>> > +struct net *vsock_global_net(void)
>> > {
>> > + return NULL;
>> > +}
>> > +EXPORT_SYMBOL_GPL(vsock_global_net);
>> > +
>> > +static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr,
>> > + struct net *net)
>> > +{
>>
>> Please add a comment here to describe what fallback is used for.
>> And I would suggest also something on top of this file to explain a bit
>> how netns are handled in AF_VSOCK.
>>
>
>sgtm!
>
>> > + struct sock *fallback = NULL;
>> > struct vsock_sock *vsk;
>> >
>> > list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
>> > - if (vsock_addr_equals_addr(addr, &vsk->local_addr))
>> > - return sk_vsock(vsk);
>> > + if (vsock_addr_equals_addr(addr, &vsk->local_addr)) {
>> > + if (net_eq(net, sock_net(sk_vsock(vsk))))
>> > + return sk_vsock(vsk);
>> >
>> > + if (net_eq(net, vsock_global_net()))
>> > + fallback = sk_vsock(vsk);
>> > + }
>> > if (addr->svm_port == vsk->local_addr.svm_port &&
>> > (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
>> > - addr->svm_cid == VMADDR_CID_ANY))
>> > - return sk_vsock(vsk);
>> > + addr->svm_cid == VMADDR_CID_ANY)) {
>> > + if (net_eq(net, sock_net(sk_vsock(vsk))))
>> > + return sk_vsock(vsk);
>> > +
>> > + if (net_eq(net, vsock_global_net()))
>> > + fallback = sk_vsock(vsk);
>> > + }
>> > }
>> >
>> > - return NULL;
>> > + return fallback;
>> > }
>> >
>> > static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
>> > - struct sockaddr_vm *dst)
>> > + struct sockaddr_vm *dst,
>> > + struct net *net)
>> > {
>> > + struct sock *fallback = NULL;
>> > struct vsock_sock *vsk;
>> >
>> > list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
>> > connected_table) {
>> > if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
>> > dst->svm_port == vsk->local_addr.svm_port) {
>> > - return sk_vsock(vsk);
>> > + if (net_eq(net, sock_net(sk_vsock(vsk))))
>> > + return sk_vsock(vsk);
>> > +
>> > + if (net_eq(net, vsock_global_net()))
>> > + fallback = sk_vsock(vsk);
>>
>> This pattern seems to be repeated 3 times, can we make a function/macro?
>>
>
>yep, no problem!
>
>> > }
>> > }
>> >
>> > - return NULL;
>> > + return fallback;
>> > }
>> >
>> > static void vsock_insert_unbound(struct vsock_sock *vsk)
>> > @@ -304,12 +327,12 @@ void vsock_remove_connected(struct vsock_sock *vsk)
>> > }
>> > EXPORT_SYMBOL_GPL(vsock_remove_connected);
>> >
>> > -struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
>> > +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr, struct net *net)
>> > {
>> > struct sock *sk;
>> >
>> > spin_lock_bh(&vsock_table_lock);
>> > - sk = __vsock_find_bound_socket(addr);
>> > + sk = __vsock_find_bound_socket(addr, net);
>> > if (sk)
>> > sock_hold(sk);
>> >
>> > @@ -320,12 +343,13 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
>> > EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
>> >
>> > struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
>> > - struct sockaddr_vm *dst)
>> > + struct sockaddr_vm *dst,
>> > + struct net *net)
>> > {
>> > struct sock *sk;
>> >
>> > spin_lock_bh(&vsock_table_lock);
>> > - sk = __vsock_find_connected_socket(src, dst);
>> > + sk = __vsock_find_connected_socket(src, dst, net);
>> > if (sk)
>> > sock_hold(sk);
>> >
>> > @@ -644,6 +668,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
>> > {
>> > static u32 port;
>> > struct sockaddr_vm new_addr;
>> > + struct net *net = sock_net(sk_vsock(vsk));
>> >
>> > if (!port)
>> > port = get_random_u32_above(LAST_RESERVED_PORT);
>> > @@ -660,7 +685,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
>> >
>> > new_addr.svm_port = port++;
>> >
>> > - if (!__vsock_find_bound_socket(&new_addr)) {
>> > + if (!__vsock_find_bound_socket(&new_addr, net)) {
>> > found = true;
>> > break;
>> > }
>> > @@ -677,7 +702,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
>> > return -EACCES;
>> > }
>> >
>> > - if (__vsock_find_bound_socket(&new_addr))
>> > + if (__vsock_find_bound_socket(&new_addr, net))
>> > return -EADDRINUSE;
>> > }
>> >
>> > diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
>> > index 31342ab502b4fc35feb812d2c94e0e35ded73771..253609898d24f8a484fcfc3296011c6f501a72a8 100644
>> > --- a/net/vmw_vsock/hyperv_transport.c
>> > +++ b/net/vmw_vsock/hyperv_transport.c
>> > @@ -313,7 +313,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
>> > return;
>> >
>> > hvs_addr_init(&addr, conn_from_host ? if_type : if_instance);
>> > - sk = vsock_find_bound_socket(&addr);
>> > + sk = vsock_find_bound_socket(&addr, NULL);
>> > if (!sk)
>> > return;
>> >
>> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>> > index 7f7de6d8809655fe522749fbbc9025df71f071bd..256d2a4fe482b3cb938a681b6924be69b2065616 100644
>> > --- a/net/vmw_vsock/virtio_transport_common.c
>> > +++ b/net/vmw_vsock/virtio_transport_common.c
>> > @@ -1590,6 +1590,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
>> > struct sk_buff *skb)
>> > {
>> > struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
>> > + struct net *net = vsock_global_net();
>>
>> Why using vsock_global_net() in virtio and directly NULL in the others
>> transports?
>>
>
>This was an oversight on my part, I found an unnamed NULL harder to
>reason about, switched to the func, but forgot to switch over the other
>transports.
>
>BTW, I was unsure about just making NULL a macro (e.g.,
>VIRTIO_VSOCK_GLOBAL_NET?) instead of a function. I just used a function
>because A) I noticed in the prior rev that the default net was a
>function instead of some macro to &init_net, and B) the function seemed
>a little more flexible for future changes. What are your thoughts here?
Inline function in the header should be fine IMHO.
Thanks,
Stefano
>
>
>Thanks for the review!
>
>Best,
>Bobby
>
* Re: [PATCH v2 1/3] vsock: add network namespace support
2025-03-20 8:57 ` Stefano Garzarella
@ 2025-03-20 20:56 ` Bobby Eshleman
0 siblings, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-20 20:56 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Thu, Mar 20, 2025 at 09:57:57AM +0100, Stefano Garzarella wrote:
> On Wed, Mar 19, 2025 at 12:00:38PM -0700, Bobby Eshleman wrote:
> > On Wed, Mar 19, 2025 at 02:02:32PM +0100, Stefano Garzarella wrote:
> > > On Wed, Mar 12, 2025 at 01:59:35PM -0700, Bobby Eshleman wrote:
> > > > From: Stefano Garzarella <sgarzare@redhat.com>
> >
> > BTW, I was unsure about just making NULL a macro (e.g.,
> > VIRTIO_VSOCK_GLOBAL_NET?) instead of a function. I just used a function
> > because A) I noticed in the prior rev that the default net was a
> > function instead of some macro to &init_net, and B) the function seemed
> > a little more flexible for future changes. What are your thoughts here?
>
> Inline function in the header should be fine IMHO.
>
> Thanks,
> Stefano
>
Got it, thanks!
Best,
Bobby
* [PATCH v2 2/3] vsock/virtio_transport_common: handle netns of received packets
2025-03-12 20:59 [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
2025-03-12 20:59 ` [PATCH v2 1/3] vsock: add network namespace support Bobby Eshleman
@ 2025-03-12 20:59 ` Bobby Eshleman
2025-03-19 13:26 ` Stefano Garzarella
2025-03-12 20:59 ` [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device Bobby Eshleman
` (3 subsequent siblings)
5 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-12 20:59 UTC (permalink / raw)
To: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list
Cc: David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm, Bobby Eshleman
From: Stefano Garzarella <sgarzare@redhat.com>
This patch allows transports that use virtio_transport_common
to specify the network namespace where a received packet is to
be delivered.
virtio_transport and vhost_transport, for now, still do not use this
capability and preserve old behavior.
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
---
V1 -> V2
* use vsock_global_net()
* add net to skb->cb
* forward port for skb
---
drivers/vhost/vsock.c | 1 +
include/linux/virtio_vsock.h | 2 ++
net/vmw_vsock/virtio_transport.c | 1 +
net/vmw_vsock/virtio_transport_common.c | 11 ++++++++++-
4 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 802153e230730bdbfbbb6f4ae263ae99502ef532..02e2a3551205a4398a74a167a82802d950c962f6 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -525,6 +525,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
continue;
}
+ VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
total_len += sizeof(*hdr) + skb->len;
/* Deliver to monitoring devices all received packets */
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 0387d64e2c66c69dd7ab0cad58db5cf0682ad424..e51f89559a1d92685027bf83a62c7b05dd9e566d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -12,6 +12,7 @@
struct virtio_vsock_skb_cb {
bool reply;
bool tap_delivered;
+ struct net *net;
u32 offset;
};
@@ -148,6 +149,7 @@ struct virtio_vsock_pkt_info {
u32 remote_cid, remote_port;
struct vsock_sock *vsk;
struct msghdr *msg;
+ struct net *net;
u32 pkt_len;
u16 type;
u16 op;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index f0e48e6911fc46cba87f7dafeb8dbc21421df254..163ddfc0808529ad6dda7992f9ec48837dd7337c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -650,6 +650,7 @@ static void virtio_transport_rx_work(struct work_struct *work)
virtio_vsock_skb_rx_put(skb);
virtio_transport_deliver_tap_pkt(skb);
+ VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
virtio_transport_recv_pkt(&virtio_transport, skb);
}
} while (!virtqueue_enable_cb(vq));
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 256d2a4fe482b3cb938a681b6924be69b2065616..028591a5863b84059b8e8bbafd499cb997a0c863 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -314,6 +314,8 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *
info->flags,
zcopy);
+ VIRTIO_VSOCK_SKB_CB(skb)->net = info->net;
+
return skb;
out:
kfree_skb(skb);
@@ -523,6 +525,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1061,6 +1064,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_REQUEST,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1076,6 +1080,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
(mode & SEND_SHUTDOWN ?
VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1102,6 +1107,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
.msg = msg,
.pkt_len = len,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1139,6 +1145,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
.op = VIRTIO_VSOCK_OP_RST,
.reply = !!skb,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
/* Send RST only if the original pkt is not a RST pkt */
@@ -1159,6 +1166,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
.op = VIRTIO_VSOCK_OP_RST,
.type = le16_to_cpu(hdr->type),
.reply = true,
+ .net = VIRTIO_VSOCK_SKB_CB(skb)->net,
};
struct sk_buff *reply;
@@ -1476,6 +1484,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
.remote_port = le32_to_cpu(hdr->src_port),
.reply = true,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1590,7 +1599,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
struct sk_buff *skb)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
- struct net *net = vsock_global_net();
+ struct net *net = VIRTIO_VSOCK_SKB_CB(skb)->net;
struct sockaddr_vm src, dst;
struct vsock_sock *vsk;
struct sock *sk;
--
2.47.1
* Re: [PATCH v2 2/3] vsock/virtio_transport_common: handle netns of received packets
2025-03-12 20:59 ` [PATCH v2 2/3] vsock/virtio_transport_common: handle netns of received packets Bobby Eshleman
@ 2025-03-19 13:26 ` Stefano Garzarella
2025-03-19 19:05 ` Bobby Eshleman
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-19 13:26 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 12, 2025 at 01:59:36PM -0700, Bobby Eshleman wrote:
>From: Stefano Garzarella <sgarzare@redhat.com>
>
>This patch allows transports that use virtio_transport_common
>to specify the network namespace where a received packet is to
>be delivered.
>
>virtio_transport and vhost_transport, for now, still do not use this
>capability and preserve old behavior.
What about vsock_loopback?
>
>Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
>---
>V1 -> V2
> * use vsock_global_net()
> * add net to skb->cb
> * forward port for skb
>---
> drivers/vhost/vsock.c | 1 +
> include/linux/virtio_vsock.h | 2 ++
> net/vmw_vsock/virtio_transport.c | 1 +
> net/vmw_vsock/virtio_transport_common.c | 11 ++++++++++-
> 4 files changed, 14 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index 802153e230730bdbfbbb6f4ae263ae99502ef532..02e2a3551205a4398a74a167a82802d950c962f6 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -525,6 +525,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> continue;
> }
>
>+ VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
I'd add a helper for that.
Or, can we avoid that and pass the net parameter to
virtio_transport_recv_pkt()?
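The helper could look roughly like this (name illustrative only):
static inline void virtio_vsock_skb_set_net(struct sk_buff *skb,
					    struct net *net)
{
	VIRTIO_VSOCK_SKB_CB(skb)->net = net;
}
with the vhost call site then becoming, e.g.,
	virtio_vsock_skb_set_net(skb, vsock_global_net());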
> total_len += sizeof(*hdr) + skb->len;
>
> /* Deliver to monitoring devices all received packets */
>diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>index 0387d64e2c66c69dd7ab0cad58db5cf0682ad424..e51f89559a1d92685027bf83a62c7b05dd9e566d 100644
>--- a/include/linux/virtio_vsock.h
>+++ b/include/linux/virtio_vsock.h
>@@ -12,6 +12,7 @@
> struct virtio_vsock_skb_cb {
> bool reply;
> bool tap_delivered;
>+ struct net *net;
> u32 offset;
> };
>
>@@ -148,6 +149,7 @@ struct virtio_vsock_pkt_info {
> u32 remote_cid, remote_port;
> struct vsock_sock *vsk;
> struct msghdr *msg;
>+ struct net *net;
> u32 pkt_len;
> u16 type;
> u16 op;
>diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>index f0e48e6911fc46cba87f7dafeb8dbc21421df254..163ddfc0808529ad6dda7992f9ec48837dd7337c 100644
>--- a/net/vmw_vsock/virtio_transport.c
>+++ b/net/vmw_vsock/virtio_transport.c
>@@ -650,6 +650,7 @@ static void virtio_transport_rx_work(struct work_struct *work)
>
> virtio_vsock_skb_rx_put(skb);
> virtio_transport_deliver_tap_pkt(skb);
>+ VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
> virtio_transport_recv_pkt(&virtio_transport, skb);
> }
> } while (!virtqueue_enable_cb(vq));
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index 256d2a4fe482b3cb938a681b6924be69b2065616..028591a5863b84059b8e8bbafd499cb997a0c863 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -314,6 +314,8 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *
> info->flags,
> zcopy);
>
>+ VIRTIO_VSOCK_SKB_CB(skb)->net = info->net;
>+
> return skb;
> out:
> kfree_skb(skb);
>@@ -523,6 +525,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk)
> struct virtio_vsock_pkt_info info = {
> .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1061,6 +1064,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
> struct virtio_vsock_pkt_info info = {
> .op = VIRTIO_VSOCK_OP_REQUEST,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1076,6 +1080,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
> (mode & SEND_SHUTDOWN ?
> VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1102,6 +1107,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
> .msg = msg,
> .pkt_len = len,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1139,6 +1145,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
> .op = VIRTIO_VSOCK_OP_RST,
> .reply = !!skb,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
> };
>
> /* Send RST only if the original pkt is not a RST pkt */
>@@ -1159,6 +1166,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
> .op = VIRTIO_VSOCK_OP_RST,
> .type = le16_to_cpu(hdr->type),
> .reply = true,
>+ .net = VIRTIO_VSOCK_SKB_CB(skb)->net,
> };
> struct sk_buff *reply;
>
>@@ -1476,6 +1484,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
> .remote_port = le32_to_cpu(hdr->src_port),
> .reply = true,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1590,7 +1599,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> struct sk_buff *skb)
> {
> struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
>- struct net *net = vsock_global_net();
>+ struct net *net = VIRTIO_VSOCK_SKB_CB(skb)->net;
> struct sockaddr_vm src, dst;
> struct vsock_sock *vsk;
> struct sock *sk;
>
>--
>2.47.1
>
* Re: [PATCH v2 2/3] vsock/virtio_transport_common: handle netns of received packets
2025-03-19 13:26 ` Stefano Garzarella
@ 2025-03-19 19:05 ` Bobby Eshleman
0 siblings, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-19 19:05 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 19, 2025 at 02:26:04PM +0100, Stefano Garzarella wrote:
> On Wed, Mar 12, 2025 at 01:59:36PM -0700, Bobby Eshleman wrote:
> > From: Stefano Garzarella <sgarzare@redhat.com>
> >
> > This patch allows transports that use virtio_transport_common
> > to specify the network namespace where a received packet is to
> > be delivered.
> >
> > virtio_transport and vhost_transport, for now, still do not use this
> > capability and preserve old behavior.
>
> What about vsock_loopback?
>
Also unaffected, I'll add that comment here too.
> >
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
> > ---
> > V1 -> V2
> > * use vsock_global_net()
> > * add net to skb->cb
> > * forward port for skb
> > ---
> > drivers/vhost/vsock.c | 1 +
> > include/linux/virtio_vsock.h | 2 ++
> > net/vmw_vsock/virtio_transport.c | 1 +
> > net/vmw_vsock/virtio_transport_common.c | 11 ++++++++++-
> > 4 files changed, 14 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index 802153e230730bdbfbbb6f4ae263ae99502ef532..02e2a3551205a4398a74a167a82802d950c962f6 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -525,6 +525,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> > continue;
> > }
> >
> > + VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
>
> I'd add an helper for that.
>
> Or, can we avoid that and pass the net parameter to
> virtio_transport_recv_pkt()?
>
Makes sense. I like that passing it in reduces the places that cb->net is
assigned.
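i.e. roughly (hypothetical signature, just to illustrate the alternative):
void virtio_transport_recv_pkt(struct virtio_transport *t,
			       struct sk_buff *skb,
			       struct net *net);
so callers pass the namespace explicitly, e.g.
	virtio_transport_recv_pkt(&virtio_transport, skb, vsock_global_net());
and the cb->net assignment happens in one place inside
virtio_transport_recv_pkt().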
> > total_len += sizeof(*hdr) + skb->len;
> >
> > /* Deliver to monitoring devices all received packets */
> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > index 0387d64e2c66c69dd7ab0cad58db5cf0682ad424..e51f89559a1d92685027bf83a62c7b05dd9e566d 100644
> > --- a/include/linux/virtio_vsock.h
> > +++ b/include/linux/virtio_vsock.h
> > @@ -12,6 +12,7 @@
> > struct virtio_vsock_skb_cb {
> > bool reply;
> > bool tap_delivered;
> > + struct net *net;
> > u32 offset;
> > };
> >
> > @@ -148,6 +149,7 @@ struct virtio_vsock_pkt_info {
> > u32 remote_cid, remote_port;
> > struct vsock_sock *vsk;
> > struct msghdr *msg;
> > + struct net *net;
> > u32 pkt_len;
> > u16 type;
> > u16 op;
> > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> > index f0e48e6911fc46cba87f7dafeb8dbc21421df254..163ddfc0808529ad6dda7992f9ec48837dd7337c 100644
> > --- a/net/vmw_vsock/virtio_transport.c
> > +++ b/net/vmw_vsock/virtio_transport.c
> > @@ -650,6 +650,7 @@ static void virtio_transport_rx_work(struct work_struct *work)
> >
> > virtio_vsock_skb_rx_put(skb);
> > virtio_transport_deliver_tap_pkt(skb);
> > + VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
> > virtio_transport_recv_pkt(&virtio_transport, skb);
> > }
> > } while (!virtqueue_enable_cb(vq));
> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> > index 256d2a4fe482b3cb938a681b6924be69b2065616..028591a5863b84059b8e8bbafd499cb997a0c863 100644
> > --- a/net/vmw_vsock/virtio_transport_common.c
> > +++ b/net/vmw_vsock/virtio_transport_common.c
> > @@ -314,6 +314,8 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *
> > info->flags,
> > zcopy);
> >
> > + VIRTIO_VSOCK_SKB_CB(skb)->net = info->net;
> > +
> > return skb;
> > out:
> > kfree_skb(skb);
> > @@ -523,6 +525,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk)
> > struct virtio_vsock_pkt_info info = {
> > .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
> > .vsk = vsk,
> > + .net = sock_net(sk_vsock(vsk)),
> > };
> >
> > return virtio_transport_send_pkt_info(vsk, &info);
> > @@ -1061,6 +1064,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
> > struct virtio_vsock_pkt_info info = {
> > .op = VIRTIO_VSOCK_OP_REQUEST,
> > .vsk = vsk,
> > + .net = sock_net(sk_vsock(vsk)),
> > };
> >
> > return virtio_transport_send_pkt_info(vsk, &info);
> > @@ -1076,6 +1080,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
> > (mode & SEND_SHUTDOWN ?
> > VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
> > .vsk = vsk,
> > + .net = sock_net(sk_vsock(vsk)),
> > };
> >
> > return virtio_transport_send_pkt_info(vsk, &info);
> > @@ -1102,6 +1107,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
> > .msg = msg,
> > .pkt_len = len,
> > .vsk = vsk,
> > + .net = sock_net(sk_vsock(vsk)),
> > };
> >
> > return virtio_transport_send_pkt_info(vsk, &info);
> > @@ -1139,6 +1145,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
> > .op = VIRTIO_VSOCK_OP_RST,
> > .reply = !!skb,
> > .vsk = vsk,
> > + .net = sock_net(sk_vsock(vsk)),
> > };
> >
> > /* Send RST only if the original pkt is not a RST pkt */
> > @@ -1159,6 +1166,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
> > .op = VIRTIO_VSOCK_OP_RST,
> > .type = le16_to_cpu(hdr->type),
> > .reply = true,
> > + .net = VIRTIO_VSOCK_SKB_CB(skb)->net,
> > };
> > struct sk_buff *reply;
> >
> > @@ -1476,6 +1484,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
> > .remote_port = le32_to_cpu(hdr->src_port),
> > .reply = true,
> > .vsk = vsk,
> > + .net = sock_net(sk_vsock(vsk)),
> > };
> >
> > return virtio_transport_send_pkt_info(vsk, &info);
> > @@ -1590,7 +1599,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> > struct sk_buff *skb)
> > {
> > struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
> > - struct net *net = vsock_global_net();
> > + struct net *net = VIRTIO_VSOCK_SKB_CB(skb)->net;
> > struct sockaddr_vm src, dst;
> > struct vsock_sock *vsk;
> > struct sock *sk;
> >
> > --
> > 2.47.1
> >
>
Thanks!
Best,
Bobby
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-12 20:59 [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
2025-03-12 20:59 ` [PATCH v2 1/3] vsock: add network namespace support Bobby Eshleman
2025-03-12 20:59 ` [PATCH v2 2/3] vsock/virtio_transport_common: handle netns of received packets Bobby Eshleman
@ 2025-03-12 20:59 ` Bobby Eshleman
2025-03-19 14:15 ` Stefano Garzarella
2025-03-19 21:09 ` Paolo Abeni
2025-03-13 2:28 ` [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
` (2 subsequent siblings)
5 siblings, 2 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-12 20:59 UTC (permalink / raw)
To: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list
Cc: David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm, Bobby Eshleman
From: Stefano Garzarella <sgarzare@redhat.com>
This patch assigns the network namespace of the process that opened
the vhost-vsock-netns device (e.g. the VMM) to the packets coming from
the guest, allowing only host sockets in the same network namespace to
communicate with the guest.
This patch also allows having different VMs, running in different
network namespaces, with the same CID/port.
This patch brings namespace support only to vhost-vsock, not to
virtio-vsock, hyperv, or vmci.
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
---
v1 -> v2
* removed 'netns' module param
* renamed vsock_default_net() to vsock_global_net() to reflect new
semantics
* reserved NULL net to indicate "global" vsock namespace
RFC -> v1
* added 'netns' module param
* added 'vsock_net_eq()' to check the "net" assigned to a socket
only when 'netns' support is enabled
---
drivers/vhost/vsock.c | 97 +++++++++++++++++++++++++++++++++-------
include/linux/miscdevice.h | 1 +
include/net/af_vsock.h | 3 +-
net/vmw_vsock/af_vsock.c | 30 ++++++++++++-
net/vmw_vsock/virtio_transport.c | 4 +-
net/vmw_vsock/vsock_loopback.c | 4 +-
6 files changed, 117 insertions(+), 22 deletions(-)
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 02e2a3551205a4398a74a167a82802d950c962f6..8702beb8238aa290b6d901e53c36481637840017 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -46,6 +46,7 @@ static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
struct vhost_vsock {
struct vhost_dev dev;
struct vhost_virtqueue vqs[2];
+ struct net *net;
/* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
struct hlist_node hash;
@@ -67,8 +68,9 @@ static u32 vhost_transport_get_local_cid(void)
/* Callers that dereference the return value must hold vhost_vsock_mutex or the
* RCU read lock.
*/
-static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
+static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net, bool global_fallback)
{
+ struct vhost_vsock *fallback = NULL;
struct vhost_vsock *vsock;
hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
@@ -78,11 +80,18 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
if (other_cid == 0)
continue;
- if (other_cid == guest_cid)
- return vsock;
+ if (other_cid == guest_cid) {
+ if (net_eq(net, vsock->net))
+ return vsock;
+ if (net_eq(vsock->net, vsock_global_net()))
+ fallback = vsock;
+ }
}
+ if (global_fallback)
+ return fallback;
+
return NULL;
}
@@ -272,13 +281,14 @@ static int
vhost_transport_send_pkt(struct sk_buff *skb)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+ struct net *net = VIRTIO_VSOCK_SKB_CB(skb)->net;
struct vhost_vsock *vsock;
int len = skb->len;
rcu_read_lock();
/* Find the vhost_vsock according to guest context id */
- vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
+ vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid), net, true);
if (!vsock) {
rcu_read_unlock();
kfree_skb(skb);
@@ -305,7 +315,8 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
rcu_read_lock();
/* Find the vhost_vsock according to guest context id */
- vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
+ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid,
+ sock_net(sk_vsock(vsk)), true);
if (!vsock)
goto out;
@@ -403,7 +414,7 @@ static bool vhost_transport_msgzerocopy_allow(void)
return true;
}
-static bool vhost_transport_seqpacket_allow(u32 remote_cid);
+static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
static struct virtio_transport vhost_transport = {
.transport = {
@@ -459,13 +470,14 @@ static struct virtio_transport vhost_transport = {
.send_pkt = vhost_transport_send_pkt,
};
-static bool vhost_transport_seqpacket_allow(u32 remote_cid)
+static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
{
+ struct net *net = sock_net(sk_vsock(vsk));
struct vhost_vsock *vsock;
bool seqpacket_allow = false;
rcu_read_lock();
- vsock = vhost_vsock_get(remote_cid);
+ vsock = vhost_vsock_get(remote_cid, net, true);
if (vsock)
seqpacket_allow = vsock->seqpacket_allow;
@@ -525,7 +537,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
continue;
}
- VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
+ VIRTIO_VSOCK_SKB_CB(skb)->net = vsock->net;
total_len += sizeof(*hdr) + skb->len;
/* Deliver to monitoring devices all received packets */
@@ -650,7 +662,7 @@ static void vhost_vsock_free(struct vhost_vsock *vsock)
kvfree(vsock);
}
-static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
+static int __vhost_vsock_dev_open(struct inode *inode, struct file *file, struct net *net)
{
struct vhost_virtqueue **vqs;
struct vhost_vsock *vsock;
@@ -669,6 +681,8 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
goto out;
}
+ vsock->net = net;
+
vsock->guest_cid = 0; /* no CID assigned yet */
vsock->seqpacket_allow = false;
@@ -693,6 +707,22 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
return ret;
}
+static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
+{
+ return __vhost_vsock_dev_open(inode, file, vsock_global_net());
+}
+
+static int vhost_vsock_netns_dev_open(struct inode *inode, struct file *file)
+{
+ struct net *net;
+
+ net = get_net_ns_by_pid(current->pid);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ return __vhost_vsock_dev_open(inode, file, net);
+}
+
static void vhost_vsock_flush(struct vhost_vsock *vsock)
{
vhost_dev_flush(&vsock->dev);
@@ -708,7 +738,7 @@ static void vhost_vsock_reset_orphans(struct sock *sk)
*/
/* If the peer is still valid, no need to reset connection */
- if (vhost_vsock_get(vsk->remote_addr.svm_cid))
+ if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk), true))
return;
/* If the close timeout is pending, let it expire. This avoids races
@@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
vhost_dev_cleanup(&vsock->dev);
+ if (vsock->net)
+ put_net(vsock->net);
kfree(vsock->dev.vqs);
vhost_vsock_free(vsock);
return 0;
@@ -777,9 +809,15 @@ static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
if (vsock_find_cid(guest_cid))
return -EADDRINUSE;
+ /* If this namespace has already connected to this CID, then report
+ * that this address is already in use.
+ */
+ if (vsock->net && vsock_net_has_connected(vsock->net, guest_cid))
+ return -EADDRINUSE;
+
/* Refuse if CID is already in use */
mutex_lock(&vhost_vsock_mutex);
- other = vhost_vsock_get(guest_cid);
+ other = vhost_vsock_get(guest_cid, vsock->net, false);
if (other && other != vsock) {
mutex_unlock(&vhost_vsock_mutex);
return -EADDRINUSE;
@@ -931,6 +969,24 @@ static struct miscdevice vhost_vsock_misc = {
.fops = &vhost_vsock_fops,
};
+static const struct file_operations vhost_vsock_netns_fops = {
+ .owner = THIS_MODULE,
+ .open = vhost_vsock_netns_dev_open,
+ .release = vhost_vsock_dev_release,
+ .llseek = noop_llseek,
+ .unlocked_ioctl = vhost_vsock_dev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .read_iter = vhost_vsock_chr_read_iter,
+ .write_iter = vhost_vsock_chr_write_iter,
+ .poll = vhost_vsock_chr_poll,
+};
+
+static struct miscdevice vhost_vsock_netns_misc = {
+ .minor = VHOST_VSOCK_NETNS_MINOR,
+ .name = "vhost-vsock-netns",
+ .fops = &vhost_vsock_netns_fops,
+};
+
static int __init vhost_vsock_init(void)
{
int ret;
@@ -941,17 +997,26 @@ static int __init vhost_vsock_init(void)
return ret;
ret = misc_register(&vhost_vsock_misc);
- if (ret) {
- vsock_core_unregister(&vhost_transport.transport);
- return ret;
- }
+ if (ret)
+ goto out_vt;
+
+ ret = misc_register(&vhost_vsock_netns_misc);
+ if (ret)
+ goto out_vvm;
return 0;
+
+out_vvm:
+ misc_deregister(&vhost_vsock_misc);
+out_vt:
+ vsock_core_unregister(&vhost_transport.transport);
+ return ret;
};
static void __exit vhost_vsock_exit(void)
{
misc_deregister(&vhost_vsock_misc);
+ misc_deregister(&vhost_vsock_netns_misc);
vsock_core_unregister(&vhost_transport.transport);
};
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 69e110c2b86a9b16c1637778a25e1eebb3fd0111..a7e11b70a5398a225c4d63d50ac460e6388e022c 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -71,6 +71,7 @@
#define USERIO_MINOR 240
#define VHOST_VSOCK_MINOR 241
#define RFKILL_MINOR 242
+#define VHOST_VSOCK_NETNS_MINOR 243
#define MISC_DYNAMIC_MINOR 255
struct device;
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 41afbc18648c953da27a93571d408de968aa7668..1e37737689a741d91e64b8c0ed0a931fc7376194 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -143,7 +143,7 @@ struct vsock_transport {
int flags);
int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
size_t len);
- bool (*seqpacket_allow)(u32 remote_cid);
+ bool (*seqpacket_allow)(struct vsock_sock *vsk, u32 remote_cid);
u32 (*seqpacket_has_data)(struct vsock_sock *vsk);
/* Notification. */
@@ -258,4 +258,5 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
}
struct net *vsock_global_net(void);
+bool vsock_net_has_connected(struct net *net, u64 guest_cid);
#endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d206489bf0a81cf989387c7c8063be91a7c21a7d..58fa415555d6aae5043b5ca2bfc4783af566cf28 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -391,6 +391,34 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport,
}
EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
+bool vsock_net_has_connected(struct net *net, u64 guest_cid)
+{
+ bool ret = false;
+ int i;
+
+ spin_lock_bh(&vsock_table_lock);
+
+ for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
+ struct vsock_sock *vsk;
+
+ list_for_each_entry(vsk, &vsock_connected_table[i],
+ connected_table) {
+ if (sock_net(sk_vsock(vsk)) != net)
+ continue;
+
+ if (vsk->remote_addr.svm_cid == guest_cid) {
+ ret = true;
+ goto out;
+ }
+ }
+ }
+
+out:
+ spin_unlock_bh(&vsock_table_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vsock_net_has_connected);
+
void vsock_add_pending(struct sock *listener, struct sock *pending)
{
struct vsock_sock *vlistener;
@@ -537,7 +565,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
if (sk->sk_type == SOCK_SEQPACKET) {
if (!new_transport->seqpacket_allow ||
- !new_transport->seqpacket_allow(remote_cid)) {
+ !new_transport->seqpacket_allow(vsk, remote_cid)) {
module_put(new_transport->module);
return -ESOCKTNOSUPPORT;
}
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 163ddfc0808529ad6dda7992f9ec48837dd7337c..60bf3f1f39c51d44768fd2f04df3abee9f966252 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -536,7 +536,7 @@ static bool virtio_transport_msgzerocopy_allow(void)
return true;
}
-static bool virtio_transport_seqpacket_allow(u32 remote_cid);
+static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
static struct virtio_transport virtio_transport = {
.transport = {
@@ -593,7 +593,7 @@ static struct virtio_transport virtio_transport = {
.can_msgzerocopy = virtio_transport_can_msgzerocopy,
};
-static bool virtio_transport_seqpacket_allow(u32 remote_cid)
+static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
{
struct virtio_vsock *vsock;
bool seqpacket_allow;
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 6e78927a598e07cf77386a420b9d05d3f491dc7c..1b2fab73e0d0a6c63ed60d29fc837da58f6fb121 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -46,7 +46,7 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
return 0;
}
-static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
+static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
static bool vsock_loopback_msgzerocopy_allow(void)
{
return true;
@@ -106,7 +106,7 @@ static struct virtio_transport loopback_transport = {
.send_pkt = vsock_loopback_send_pkt,
};
-static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
+static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
{
return true;
}
--
2.47.1
^ permalink raw reply related [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-12 20:59 ` [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device Bobby Eshleman
@ 2025-03-19 14:15 ` Stefano Garzarella
2025-03-19 19:28 ` Bobby Eshleman
2025-03-19 21:09 ` Paolo Abeni
1 sibling, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-19 14:15 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 12, 2025 at 01:59:37PM -0700, Bobby Eshleman wrote:
>From: Stefano Garzarella <sgarzare@redhat.com>
>
>This patch assigns the network namespace of the process that opened
>the vhost-vsock-netns device (e.g. the VMM) to the packets coming from
>the guest, allowing only host sockets in the same network namespace to
>communicate with the guest.
>
>This patch also allows having different VMs, running in different
>network namespaces, with the same CID/port.
>
>This patch brings namespace support only to vhost-vsock, not to
>virtio-vsock, hyperv, or vmci.
>
>Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Please describe here the new behaviour you mentioned in the cover
letter about connected CIDs, etc.
>Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
>---
>v1 -> v2
> * removed 'netns' module param
> * renamed vsock_default_net() to vsock_global_net() to reflect new
> semantics
> * reserved NULL net to indicate "global" vsock namespace
>
>RFC -> v1
>* added 'netns' module param
>* added 'vsock_net_eq()' to check the "net" assigned to a socket
> only when 'netns' support is enabled
>---
> drivers/vhost/vsock.c | 97 +++++++++++++++++++++++++++++++++-------
> include/linux/miscdevice.h | 1 +
> include/net/af_vsock.h | 3 +-
> net/vmw_vsock/af_vsock.c | 30 ++++++++++++-
> net/vmw_vsock/virtio_transport.c | 4 +-
> net/vmw_vsock/vsock_loopback.c | 4 +-
> 6 files changed, 117 insertions(+), 22 deletions(-)
>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index 02e2a3551205a4398a74a167a82802d950c962f6..8702beb8238aa290b6d901e53c36481637840017 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -46,6 +46,7 @@ static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
> struct vhost_vsock {
> struct vhost_dev dev;
> struct vhost_virtqueue vqs[2];
>+ struct net *net;
>
> /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
> struct hlist_node hash;
>@@ -67,8 +68,9 @@ static u32 vhost_transport_get_local_cid(void)
> /* Callers that dereference the return value must hold vhost_vsock_mutex or the
> * RCU read lock.
> */
>-static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>+static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net, bool global_fallback)
> {
>+ struct vhost_vsock *fallback = NULL;
> struct vhost_vsock *vsock;
>
> hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
>@@ -78,11 +80,18 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> if (other_cid == 0)
> continue;
>
>- if (other_cid == guest_cid)
>- return vsock;
>+ if (other_cid == guest_cid) {
>+ if (net_eq(net, vsock->net))
>+ return vsock;
>
>+ if (net_eq(vsock->net, vsock_global_net()))
>+ fallback = vsock;
I'd like to reuse the same function that I suggested in patch 1, but I
understand that we return different things here, so we'd either use a
macro or a `void *`. Either way, I would like this logic to be
centralized somewhere and reusable in the core and transports if possible.
>+ }
> }
>
>+ if (global_fallback)
>+ return fallback;
>+
> return NULL;
> }
>
>@@ -272,13 +281,14 @@ static int
> vhost_transport_send_pkt(struct sk_buff *skb)
> {
> struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
>+ struct net *net = VIRTIO_VSOCK_SKB_CB(skb)->net;
> struct vhost_vsock *vsock;
> int len = skb->len;
>
> rcu_read_lock();
>
> /* Find the vhost_vsock according to guest context id */
>- vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
>+ vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid), net, true);
> if (!vsock) {
> rcu_read_unlock();
> kfree_skb(skb);
>@@ -305,7 +315,8 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
> rcu_read_lock();
>
> /* Find the vhost_vsock according to guest context id */
>- vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
>+ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid,
>+ sock_net(sk_vsock(vsk)), true);
> if (!vsock)
> goto out;
>
>@@ -403,7 +414,7 @@ static bool vhost_transport_msgzerocopy_allow(void)
> return true;
> }
>
>-static bool vhost_transport_seqpacket_allow(u32 remote_cid);
>+static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
>
> static struct virtio_transport vhost_transport = {
> .transport = {
>@@ -459,13 +470,14 @@ static struct virtio_transport vhost_transport = {
> .send_pkt = vhost_transport_send_pkt,
> };
>
>-static bool vhost_transport_seqpacket_allow(u32 remote_cid)
>+static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
> {
>+ struct net *net = sock_net(sk_vsock(vsk));
> struct vhost_vsock *vsock;
> bool seqpacket_allow = false;
>
> rcu_read_lock();
>- vsock = vhost_vsock_get(remote_cid);
>+ vsock = vhost_vsock_get(remote_cid, net, true);
>
> if (vsock)
> seqpacket_allow = vsock->seqpacket_allow;
>@@ -525,7 +537,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> continue;
> }
>
>- VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
>+ VIRTIO_VSOCK_SKB_CB(skb)->net = vsock->net;
> total_len += sizeof(*hdr) + skb->len;
>
> /* Deliver to monitoring devices all received packets */
>@@ -650,7 +662,7 @@ static void vhost_vsock_free(struct vhost_vsock *vsock)
> kvfree(vsock);
> }
>
>-static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
>+static int __vhost_vsock_dev_open(struct inode *inode, struct file *file, struct net *net)
> {
> struct vhost_virtqueue **vqs;
> struct vhost_vsock *vsock;
>@@ -669,6 +681,8 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> goto out;
> }
>
>+ vsock->net = net;
>+
> vsock->guest_cid = 0; /* no CID assigned yet */
> vsock->seqpacket_allow = false;
>
>@@ -693,6 +707,22 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> return ret;
> }
>
>+static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
>+{
>+ return __vhost_vsock_dev_open(inode, file, vsock_global_net());
>+}
>+
>+static int vhost_vsock_netns_dev_open(struct inode *inode, struct file *file)
>+{
>+ struct net *net;
>+
>+ net = get_net_ns_by_pid(current->pid);
>+ if (IS_ERR(net))
>+ return PTR_ERR(net);
>+
>+ return __vhost_vsock_dev_open(inode, file, net);
>+}
>+
> static void vhost_vsock_flush(struct vhost_vsock *vsock)
> {
> vhost_dev_flush(&vsock->dev);
>@@ -708,7 +738,7 @@ static void vhost_vsock_reset_orphans(struct sock *sk)
> */
>
> /* If the peer is still valid, no need to reset connection */
>- if (vhost_vsock_get(vsk->remote_addr.svm_cid))
>+ if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk), true))
> return;
>
> /* If the close timeout is pending, let it expire. This avoids races
>@@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
>
> vhost_dev_cleanup(&vsock->dev);
>+ if (vsock->net)
>+ put_net(vsock->net);
> kfree(vsock->dev.vqs);
> vhost_vsock_free(vsock);
> return 0;
>@@ -777,9 +809,15 @@ static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
> if (vsock_find_cid(guest_cid))
> return -EADDRINUSE;
>
>+ /* If this namespace has already connected to this CID, then report
>+ * that this address is already in use.
Why? (I mean, a comment should explain the reason more than just what we do)
>+ */
>+ if (vsock->net && vsock_net_has_connected(vsock->net, guest_cid))
>+ return -EADDRINUSE;
Why only if `vsock->net` is not null?
Also, if we have a function that assigns NULL to `vsock->net` because
NULL has a special meaning for us, I think we should also have a
function that checks whether it is the global netns. I mean, either we
add 2 functions, one to set it and one to check it, or none.
>+
> /* Refuse if CID is already in use */
> mutex_lock(&vhost_vsock_mutex);
>- other = vhost_vsock_get(guest_cid);
>+ other = vhost_vsock_get(guest_cid, vsock->net, false);
> if (other && other != vsock) {
> mutex_unlock(&vhost_vsock_mutex);
> return -EADDRINUSE;
>@@ -931,6 +969,24 @@ static struct miscdevice vhost_vsock_misc = {
> .fops = &vhost_vsock_fops,
> };
>
>+static const struct file_operations vhost_vsock_netns_fops = {
>+ .owner = THIS_MODULE,
>+ .open = vhost_vsock_netns_dev_open,
>+ .release = vhost_vsock_dev_release,
>+ .llseek = noop_llseek,
>+ .unlocked_ioctl = vhost_vsock_dev_ioctl,
>+ .compat_ioctl = compat_ptr_ioctl,
>+ .read_iter = vhost_vsock_chr_read_iter,
>+ .write_iter = vhost_vsock_chr_write_iter,
>+ .poll = vhost_vsock_chr_poll,
>+};
>+
>+static struct miscdevice vhost_vsock_netns_misc = {
>+ .minor = VHOST_VSOCK_NETNS_MINOR,
>+ .name = "vhost-vsock-netns",
>+ .fops = &vhost_vsock_netns_fops,
>+};
>+
> static int __init vhost_vsock_init(void)
> {
> int ret;
>@@ -941,17 +997,26 @@ static int __init vhost_vsock_init(void)
> return ret;
>
> ret = misc_register(&vhost_vsock_misc);
>- if (ret) {
>- vsock_core_unregister(&vhost_transport.transport);
>- return ret;
>- }
>+ if (ret)
>+ goto out_vt;
>+
>+ ret = misc_register(&vhost_vsock_netns_misc);
>+ if (ret)
>+ goto out_vvm;
>
> return 0;
>+
>+out_vvm:
out_misc:
>+ misc_deregister(&vhost_vsock_misc);
>+out_vt:
out_core:
>+ vsock_core_unregister(&vhost_transport.transport);
>+ return ret;
> };
>
> static void __exit vhost_vsock_exit(void)
> {
> misc_deregister(&vhost_vsock_misc);
>+ misc_deregister(&vhost_vsock_netns_misc);
nit: I'd use the reverse order of vhost_vsock_init(), so move this
new line to the top.
> vsock_core_unregister(&vhost_transport.transport);
> };
>
>diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
>index 69e110c2b86a9b16c1637778a25e1eebb3fd0111..a7e11b70a5398a225c4d63d50ac460e6388e022c 100644
>--- a/include/linux/miscdevice.h
>+++ b/include/linux/miscdevice.h
>@@ -71,6 +71,7 @@
> #define USERIO_MINOR 240
> #define VHOST_VSOCK_MINOR 241
> #define RFKILL_MINOR 242
>+#define VHOST_VSOCK_NETNS_MINOR 243
> #define MISC_DYNAMIC_MINOR 255
>
> struct device;
>diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>index 41afbc18648c953da27a93571d408de968aa7668..1e37737689a741d91e64b8c0ed0a931fc7376194 100644
>--- a/include/net/af_vsock.h
>+++ b/include/net/af_vsock.h
>@@ -143,7 +143,7 @@ struct vsock_transport {
> int flags);
> int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
> size_t len);
>- bool (*seqpacket_allow)(u32 remote_cid);
>+ bool (*seqpacket_allow)(struct vsock_sock *vsk, u32 remote_cid);
I'd do this change + transports changes in a separate patch.
> u32 (*seqpacket_has_data)(struct vsock_sock *vsk);
>
> /* Notification. */
>@@ -258,4 +258,5 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
> }
>
> struct net *vsock_global_net(void);
>+bool vsock_net_has_connected(struct net *net, u64 guest_cid);
> #endif /* __AF_VSOCK_H__ */
>diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>index d206489bf0a81cf989387c7c8063be91a7c21a7d..58fa415555d6aae5043b5ca2bfc4783af566cf28 100644
>--- a/net/vmw_vsock/af_vsock.c
>+++ b/net/vmw_vsock/af_vsock.c
>@@ -391,6 +391,34 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport,
> }
> EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
>
>+bool vsock_net_has_connected(struct net *net, u64 guest_cid)
>+{
>+ bool ret = false;
>+ int i;
>+
>+ spin_lock_bh(&vsock_table_lock);
>+
>+ for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
>+ struct vsock_sock *vsk;
>+
>+ list_for_each_entry(vsk, &vsock_connected_table[i],
>+ connected_table) {
>+ if (sock_net(sk_vsock(vsk)) != net)
>+ continue;
>+
>+ if (vsk->remote_addr.svm_cid == guest_cid) {
>+ ret = true;
>+ goto out;
>+ }
>+ }
>+ }
>+
>+out:
>+ spin_unlock_bh(&vsock_table_lock);
>+ return ret;
>+}
>+EXPORT_SYMBOL_GPL(vsock_net_has_connected);
>+
> void vsock_add_pending(struct sock *listener, struct sock *pending)
> {
> struct vsock_sock *vlistener;
>@@ -537,7 +565,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
>
> if (sk->sk_type == SOCK_SEQPACKET) {
> if (!new_transport->seqpacket_allow ||
>- !new_transport->seqpacket_allow(remote_cid)) {
>+ !new_transport->seqpacket_allow(vsk, remote_cid)) {
> module_put(new_transport->module);
> return -ESOCKTNOSUPPORT;
> }
>diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>index 163ddfc0808529ad6dda7992f9ec48837dd7337c..60bf3f1f39c51d44768fd2f04df3abee9f966252 100644
>--- a/net/vmw_vsock/virtio_transport.c
>+++ b/net/vmw_vsock/virtio_transport.c
>@@ -536,7 +536,7 @@ static bool virtio_transport_msgzerocopy_allow(void)
> return true;
> }
>
>-static bool virtio_transport_seqpacket_allow(u32 remote_cid);
>+static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
>
> static struct virtio_transport virtio_transport = {
> .transport = {
>@@ -593,7 +593,7 @@ static struct virtio_transport virtio_transport = {
> .can_msgzerocopy = virtio_transport_can_msgzerocopy,
> };
>
>-static bool virtio_transport_seqpacket_allow(u32 remote_cid)
>+static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
> {
> struct virtio_vsock *vsock;
> bool seqpacket_allow;
>diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
>index 6e78927a598e07cf77386a420b9d05d3f491dc7c..1b2fab73e0d0a6c63ed60d29fc837da58f6fb121 100644
>--- a/net/vmw_vsock/vsock_loopback.c
>+++ b/net/vmw_vsock/vsock_loopback.c
>@@ -46,7 +46,7 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
> return 0;
> }
>
>-static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
>+static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
> static bool vsock_loopback_msgzerocopy_allow(void)
> {
> return true;
>@@ -106,7 +106,7 @@ static struct virtio_transport loopback_transport = {
> .send_pkt = vsock_loopback_send_pkt,
> };
>
>-static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
>+static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
> {
> return true;
> }
>
>--
>2.47.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-19 14:15 ` Stefano Garzarella
@ 2025-03-19 19:28 ` Bobby Eshleman
0 siblings, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-19 19:28 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 19, 2025 at 03:15:36PM +0100, Stefano Garzarella wrote:
> On Wed, Mar 12, 2025 at 01:59:37PM -0700, Bobby Eshleman wrote:
> > From: Stefano Garzarella <sgarzare@redhat.com>
> >
> > This patch assigns the network namespace of the process that opened
> > the vhost-vsock-netns device (e.g. the VMM) to the packets coming from
> > the guest, allowing only host sockets in the same network namespace to
> > communicate with the guest.
> >
> > This patch also allows having different VMs, running in different
> > network namespaces, with the same CID/port.
> >
> > This patch brings namespace support only to vhost-vsock, not to
> > virtio-vsock, hyperv, or vmci.
> >
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>
> Please describe here the new behaviour you mentioned in the cover
> letter about connected CIDs, etc.
>
will do!
> > Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
> > ---
> > v1 -> v2
> > * removed 'netns' module param
> > * renamed vsock_default_net() to vsock_global_net() to reflect new
> > semantics
> > * reserved NULL net to indicate "global" vsock namespace
> >
> > RFC -> v1
> > * added 'netns' module param
> > * added 'vsock_net_eq()' to check the "net" assigned to a socket
> > only when 'netns' support is enabled
> > ---
> > drivers/vhost/vsock.c | 97 +++++++++++++++++++++++++++++++++-------
> > include/linux/miscdevice.h | 1 +
> > include/net/af_vsock.h | 3 +-
> > net/vmw_vsock/af_vsock.c | 30 ++++++++++++-
> > net/vmw_vsock/virtio_transport.c | 4 +-
> > net/vmw_vsock/vsock_loopback.c | 4 +-
> > 6 files changed, 117 insertions(+), 22 deletions(-)
> >
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index 02e2a3551205a4398a74a167a82802d950c962f6..8702beb8238aa290b6d901e53c36481637840017 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -46,6 +46,7 @@ static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
> > struct vhost_vsock {
> > struct vhost_dev dev;
> > struct vhost_virtqueue vqs[2];
> > + struct net *net;
> >
> > /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
> > struct hlist_node hash;
> > @@ -67,8 +68,9 @@ static u32 vhost_transport_get_local_cid(void)
> > /* Callers that dereference the return value must hold vhost_vsock_mutex or the
> > * RCU read lock.
> > */
> > -static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> > +static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net, bool global_fallback)
> > {
> > + struct vhost_vsock *fallback = NULL;
> > struct vhost_vsock *vsock;
> >
> > hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
> > @@ -78,11 +80,18 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> > if (other_cid == 0)
> > continue;
> >
> > - if (other_cid == guest_cid)
> > - return vsock;
> > + if (other_cid == guest_cid) {
> > + if (net_eq(net, vsock->net))
> > + return vsock;
> >
> > + if (net_eq(vsock->net, vsock_global_net()))
> > + fallback = vsock;
>
> I'd like to reuse the same function that I suggested in patch 1, but I
> understand that we return different things here, so we'd either use a macro
> or a `void *`. Either way, I would like this logic to be centralized
> somewhere and reusable in the core and transports if possible.
>
sgtm! I'll play with both options and see what comes out easiest to
follow.
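For example (purely a sketch, names not final), the policy could live
in one small helper shared by the core and the transports:

static inline bool vsock_net_match(struct net *net, struct net *candidate,
				   bool *is_fallback)
{
	/* exact namespace match */
	if (net_eq(net, candidate)) {
		*is_fallback = false;
		return true;
	}

	/* candidate lives in the "global" vsock namespace */
	if (net_eq(candidate, vsock_global_net())) {
		*is_fallback = true;
		return true;
	}

	return false;
}

and the loop in vhost_vsock_get() would then read:

	if (other_cid == guest_cid) {
		bool is_fallback;

		if (vsock_net_match(net, vsock->net, &is_fallback)) {
			if (!is_fallback)
				return vsock;
			fallback = vsock;
		}
	}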
> > + }
> > }
> >
> > + if (global_fallback)
> > + return fallback;
> > +
> > return NULL;
> > }
> >
> > @@ -272,13 +281,14 @@ static int
> > vhost_transport_send_pkt(struct sk_buff *skb)
> > {
> > struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
> > + struct net *net = VIRTIO_VSOCK_SKB_CB(skb)->net;
> > struct vhost_vsock *vsock;
> > int len = skb->len;
> >
> > rcu_read_lock();
> >
> > /* Find the vhost_vsock according to guest context id */
> > - vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
> > + vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid), net, true);
> > if (!vsock) {
> > rcu_read_unlock();
> > kfree_skb(skb);
> > @@ -305,7 +315,8 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
> > rcu_read_lock();
> >
> > /* Find the vhost_vsock according to guest context id */
> > - vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
> > + vsock = vhost_vsock_get(vsk->remote_addr.svm_cid,
> > + sock_net(sk_vsock(vsk)), true);
> > if (!vsock)
> > goto out;
> >
> > @@ -403,7 +414,7 @@ static bool vhost_transport_msgzerocopy_allow(void)
> > return true;
> > }
> >
> > -static bool vhost_transport_seqpacket_allow(u32 remote_cid);
> > +static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
> >
> > static struct virtio_transport vhost_transport = {
> > .transport = {
> > @@ -459,13 +470,14 @@ static struct virtio_transport vhost_transport = {
> > .send_pkt = vhost_transport_send_pkt,
> > };
> >
> > -static bool vhost_transport_seqpacket_allow(u32 remote_cid)
> > +static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
> > {
> > + struct net *net = sock_net(sk_vsock(vsk));
> > struct vhost_vsock *vsock;
> > bool seqpacket_allow = false;
> >
> > rcu_read_lock();
> > - vsock = vhost_vsock_get(remote_cid);
> > + vsock = vhost_vsock_get(remote_cid, net, true);
> >
> > if (vsock)
> > seqpacket_allow = vsock->seqpacket_allow;
> > @@ -525,7 +537,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> > continue;
> > }
> >
> > - VIRTIO_VSOCK_SKB_CB(skb)->net = vsock_global_net();
> > + VIRTIO_VSOCK_SKB_CB(skb)->net = vsock->net;
> > total_len += sizeof(*hdr) + skb->len;
> >
> > /* Deliver to monitoring devices all received packets */
> > @@ -650,7 +662,7 @@ static void vhost_vsock_free(struct vhost_vsock *vsock)
> > kvfree(vsock);
> > }
> >
> > -static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> > +static int __vhost_vsock_dev_open(struct inode *inode, struct file *file, struct net *net)
> > {
> > struct vhost_virtqueue **vqs;
> > struct vhost_vsock *vsock;
> > @@ -669,6 +681,8 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> > goto out;
> > }
> >
> > + vsock->net = net;
> > +
> > vsock->guest_cid = 0; /* no CID assigned yet */
> > vsock->seqpacket_allow = false;
> >
> > @@ -693,6 +707,22 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> > return ret;
> > }
> >
> > +static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> > +{
> > + return __vhost_vsock_dev_open(inode, file, vsock_global_net());
> > +}
> > +
> > +static int vhost_vsock_netns_dev_open(struct inode *inode, struct file *file)
> > +{
> > + struct net *net;
> > +
> > + net = get_net_ns_by_pid(current->pid);
> > + if (IS_ERR(net))
> > + return PTR_ERR(net);
> > +
> > + return __vhost_vsock_dev_open(inode, file, net);
> > +}
> > +
> > static void vhost_vsock_flush(struct vhost_vsock *vsock)
> > {
> > vhost_dev_flush(&vsock->dev);
> > @@ -708,7 +738,7 @@ static void vhost_vsock_reset_orphans(struct sock *sk)
> > */
> >
> > /* If the peer is still valid, no need to reset connection */
> > - if (vhost_vsock_get(vsk->remote_addr.svm_cid))
> > + if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk), true))
> > return;
> >
> > /* If the close timeout is pending, let it expire. This avoids races
> > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
> >
> > vhost_dev_cleanup(&vsock->dev);
> > + if (vsock->net)
> > + put_net(vsock->net);
> > kfree(vsock->dev.vqs);
> > vhost_vsock_free(vsock);
> > return 0;
> > @@ -777,9 +809,15 @@ static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
> > if (vsock_find_cid(guest_cid))
> > return -EADDRINUSE;
> >
> > + /* If this namespace has already connected to this CID, then report
> > + * that this address is already in use.
>
> Why? (I mean, a comment should explain the reason more than just what we do)
>
sounds good!
> > + */
> > + if (vsock->net && vsock_net_has_connected(vsock->net, guest_cid))
> > + return -EADDRINUSE;
>
> Why only if `vsock->net` is not null?
>
It can be removed. I added it in hopes of calling out that the check
isn't strictly necessary for the global ns case, as the check below for
a duplicate CID will catch that case too (whereas, for non-global, the
CID check will pass since the connected CID is in another ns).
> Also, if we have a function that assigns NULL to `vsock->net` because NULL
> has a special meaning for us, I think we should also have a function that
> checks whether it is the global netns. I mean, either we add 2 functions,
> one to set it and one to check it, or none.
>
sgtm!
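Something like this pair is what I'll try (names are placeholders, not
final):

/* NULL is reserved to mean the "global" vsock namespace */
struct net *vsock_global_net(void)
{
	return NULL;
}

static inline bool vsock_net_is_global(struct net *net)
{
	return net_eq(net, vsock_global_net());
}

so open-coded "if (vsock->net)" checks become
"if (!vsock_net_is_global(vsock->net))".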
> > +
> > /* Refuse if CID is already in use */
> > mutex_lock(&vhost_vsock_mutex);
> > - other = vhost_vsock_get(guest_cid);
> > + other = vhost_vsock_get(guest_cid, vsock->net, false);
> > if (other && other != vsock) {
> > mutex_unlock(&vhost_vsock_mutex);
> > return -EADDRINUSE;
> > @@ -931,6 +969,24 @@ static struct miscdevice vhost_vsock_misc = {
> > .fops = &vhost_vsock_fops,
> > };
> >
> > +static const struct file_operations vhost_vsock_netns_fops = {
> > + .owner = THIS_MODULE,
> > + .open = vhost_vsock_netns_dev_open,
> > + .release = vhost_vsock_dev_release,
> > + .llseek = noop_llseek,
> > + .unlocked_ioctl = vhost_vsock_dev_ioctl,
> > + .compat_ioctl = compat_ptr_ioctl,
> > + .read_iter = vhost_vsock_chr_read_iter,
> > + .write_iter = vhost_vsock_chr_write_iter,
> > + .poll = vhost_vsock_chr_poll,
> > +};
> > +
> > +static struct miscdevice vhost_vsock_netns_misc = {
> > + .minor = VHOST_VSOCK_NETNS_MINOR,
> > + .name = "vhost-vsock-netns",
> > + .fops = &vhost_vsock_netns_fops,
> > +};
> > +
> > static int __init vhost_vsock_init(void)
> > {
> > int ret;
> > @@ -941,17 +997,26 @@ static int __init vhost_vsock_init(void)
> > return ret;
> >
> > ret = misc_register(&vhost_vsock_misc);
> > - if (ret) {
> > - vsock_core_unregister(&vhost_transport.transport);
> > - return ret;
> > - }
> > + if (ret)
> > + goto out_vt;
> > +
> > + ret = misc_register(&vhost_vsock_netns_misc);
> > + if (ret)
> > + goto out_vvm;
> >
> > return 0;
> > +
> > +out_vvm:
>
> out_misc:
>
> > + misc_deregister(&vhost_vsock_misc);
> > +out_vt:
>
> out_core:
>
> > + vsock_core_unregister(&vhost_transport.transport);
> > + return ret;
> > };
> >
> > static void __exit vhost_vsock_exit(void)
> > {
> > misc_deregister(&vhost_vsock_misc);
> > + misc_deregister(&vhost_vsock_netns_misc);
>
> nit: I'd use the reverse order of vhost_vsock_init(), so move this
> new line to the top.
>
got it, will do!
> > vsock_core_unregister(&vhost_transport.transport);
> > };
> >
> > diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
> > index 69e110c2b86a9b16c1637778a25e1eebb3fd0111..a7e11b70a5398a225c4d63d50ac460e6388e022c 100644
> > --- a/include/linux/miscdevice.h
> > +++ b/include/linux/miscdevice.h
> > @@ -71,6 +71,7 @@
> > #define USERIO_MINOR 240
> > #define VHOST_VSOCK_MINOR 241
> > #define RFKILL_MINOR 242
> > +#define VHOST_VSOCK_NETNS_MINOR 243
> > #define MISC_DYNAMIC_MINOR 255
> >
> > struct device;
> > diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> > index 41afbc18648c953da27a93571d408de968aa7668..1e37737689a741d91e64b8c0ed0a931fc7376194 100644
> > --- a/include/net/af_vsock.h
> > +++ b/include/net/af_vsock.h
> > @@ -143,7 +143,7 @@ struct vsock_transport {
> > int flags);
> > int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
> > size_t len);
> > - bool (*seqpacket_allow)(u32 remote_cid);
> > + bool (*seqpacket_allow)(struct vsock_sock *vsk, u32 remote_cid);
>
> I'd do this change + transports changes in a separate patch.
>
sounds good!
> > u32 (*seqpacket_has_data)(struct vsock_sock *vsk);
> >
> > /* Notification. */
> > @@ -258,4 +258,5 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
> > }
> >
> > struct net *vsock_global_net(void);
> > +bool vsock_net_has_connected(struct net *net, u64 guest_cid);
> > #endif /* __AF_VSOCK_H__ */
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index d206489bf0a81cf989387c7c8063be91a7c21a7d..58fa415555d6aae5043b5ca2bfc4783af566cf28 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> > @@ -391,6 +391,34 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport,
> > }
> > EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
> >
> > +bool vsock_net_has_connected(struct net *net, u64 guest_cid)
> > +{
> > + bool ret = false;
> > + int i;
> > +
> > + spin_lock_bh(&vsock_table_lock);
> > +
> > + for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
> > + struct vsock_sock *vsk;
> > +
> > + list_for_each_entry(vsk, &vsock_connected_table[i],
> > + connected_table) {
> > + if (sock_net(sk_vsock(vsk)) != net)
> > + continue;
> > +
> > + if (vsk->remote_addr.svm_cid == guest_cid) {
> > + ret = true;
> > + goto out;
> > + }
> > + }
> > + }
> > +
> > +out:
> > + spin_unlock_bh(&vsock_table_lock);
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(vsock_net_has_connected);
> > +
> > void vsock_add_pending(struct sock *listener, struct sock *pending)
> > {
> > struct vsock_sock *vlistener;
> > @@ -537,7 +565,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
> >
> > if (sk->sk_type == SOCK_SEQPACKET) {
> > if (!new_transport->seqpacket_allow ||
> > - !new_transport->seqpacket_allow(remote_cid)) {
> > + !new_transport->seqpacket_allow(vsk, remote_cid)) {
> > module_put(new_transport->module);
> > return -ESOCKTNOSUPPORT;
> > }
> > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> > index 163ddfc0808529ad6dda7992f9ec48837dd7337c..60bf3f1f39c51d44768fd2f04df3abee9f966252 100644
> > --- a/net/vmw_vsock/virtio_transport.c
> > +++ b/net/vmw_vsock/virtio_transport.c
> > @@ -536,7 +536,7 @@ static bool virtio_transport_msgzerocopy_allow(void)
> > return true;
> > }
> >
> > -static bool virtio_transport_seqpacket_allow(u32 remote_cid);
> > +static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
> >
> > static struct virtio_transport virtio_transport = {
> > .transport = {
> > @@ -593,7 +593,7 @@ static struct virtio_transport virtio_transport = {
> > .can_msgzerocopy = virtio_transport_can_msgzerocopy,
> > };
> >
> > -static bool virtio_transport_seqpacket_allow(u32 remote_cid)
> > +static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
> > {
> > struct virtio_vsock *vsock;
> > bool seqpacket_allow;
> > diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
> > index 6e78927a598e07cf77386a420b9d05d3f491dc7c..1b2fab73e0d0a6c63ed60d29fc837da58f6fb121 100644
> > --- a/net/vmw_vsock/vsock_loopback.c
> > +++ b/net/vmw_vsock/vsock_loopback.c
> > @@ -46,7 +46,7 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
> > return 0;
> > }
> >
> > -static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
> > +static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid);
> > static bool vsock_loopback_msgzerocopy_allow(void)
> > {
> > return true;
> > @@ -106,7 +106,7 @@ static struct virtio_transport loopback_transport = {
> > .send_pkt = vsock_loopback_send_pkt,
> > };
> >
> > -static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
> > +static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
> > {
> > return true;
> > }
> >
> > --
> > 2.47.1
> >
>
Thanks again for the review!
Best,
Bobby
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-12 20:59 ` [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device Bobby Eshleman
2025-03-19 14:15 ` Stefano Garzarella
@ 2025-03-19 21:09 ` Paolo Abeni
2025-03-20 9:08 ` Stefano Garzarella
2025-03-20 20:57 ` Bobby Eshleman
1 sibling, 2 replies; 42+ messages in thread
From: Paolo Abeni @ 2025-03-19 21:09 UTC (permalink / raw)
To: Bobby Eshleman, Stefano Garzarella, Jakub Kicinski,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang, Xuan Zhuo,
Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list
Cc: David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On 3/12/25 9:59 PM, Bobby Eshleman wrote:
> @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
>
> vhost_dev_cleanup(&vsock->dev);
> + if (vsock->net)
> + put_net(vsock->net);
put_net() is a deprecated API, you should use put_net_track() instead.
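I.e. keep a netns_tracker next to the net pointer and pair the
alloc/put, roughly (sketch only, the field name is just an example):

	/* in struct vhost_vsock */
	struct net *net;
	netns_tracker ns_tracker;

	/* open path: get_net_ns_by_pid() already holds a reference,
	 * so just register the tracker for it
	 */
	netns_tracker_alloc(net, &vsock->ns_tracker, GFP_KERNEL);

	/* release path */
	if (vsock->net)
		put_net_track(vsock->net, &vsock->ns_tracker);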
> kfree(vsock->dev.vqs);
> vhost_vsock_free(vsock);
> return 0;
Also series introducing new features should also include the related
self-tests.
Thanks,
Paolo
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-19 21:09 ` Paolo Abeni
@ 2025-03-20 9:08 ` Stefano Garzarella
2025-03-20 21:05 ` Bobby Eshleman
2025-03-20 20:57 ` Bobby Eshleman
1 sibling, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-20 9:08 UTC (permalink / raw)
To: Paolo Abeni
Cc: Bobby Eshleman, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
>On 3/12/25 9:59 PM, Bobby Eshleman wrote:
>> @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
>> virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
>>
>> vhost_dev_cleanup(&vsock->dev);
>> + if (vsock->net)
>> + put_net(vsock->net);
>
>put_net() is a deprecated API, you should use put_net_track() instead.
>
>> kfree(vsock->dev.vqs);
>> vhost_vsock_free(vsock);
>> return 0;
>
>Also series introducing new features should also include the related
>self-tests.
Yes, I was thinking about testing as well, but to test this I think we
need to run QEMU with Linux in it, is this feasible in self-tests?
We should start looking at that, because for now I have my own ansible
script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
test both host (vhost-vsock) and guest (virtio-vsock).
Thanks,
Stefano
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-20 9:08 ` Stefano Garzarella
@ 2025-03-20 21:05 ` Bobby Eshleman
2025-03-21 10:02 ` Stefano Garzarella
0 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-20 21:05 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
> On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
> > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
> > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
> > >
> > > vhost_dev_cleanup(&vsock->dev);
> > > + if (vsock->net)
> > > + put_net(vsock->net);
> >
> > put_net() is a deprecated API, you should use put_net_track() instead.
> >
> > > kfree(vsock->dev.vqs);
> > > vhost_vsock_free(vsock);
> > > return 0;
> >
> > Also series introducing new features should also include the related
> > self-tests.
>
> Yes, I was thinking about testing as well, but to test this I think we need
> to run QEMU with Linux in it, is this feasible in self-tests?
>
> We should start looking at that, because for now I have my own ansible
> script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
> test both host (vhost-vsock) and guest (virtio-vsock).
>
Maybe as a baseline we could follow the model of
tools/testing/selftests/bpf/vmtest.sh and start by reusing your
vsock_test parameters from your Ansible script?
I don't mind writing the patches.
Best,
Bobby
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-20 21:05 ` Bobby Eshleman
@ 2025-03-21 10:02 ` Stefano Garzarella
2025-03-21 16:43 ` Bobby Eshleman
2025-03-26 0:11 ` Bobby Eshleman
0 siblings, 2 replies; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-21 10:02 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Thu, Mar 20, 2025 at 02:05:38PM -0700, Bobby Eshleman wrote:
>On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
>> On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
>> > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
>> > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
>> > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
>> > >
>> > > vhost_dev_cleanup(&vsock->dev);
>> > > + if (vsock->net)
>> > > + put_net(vsock->net);
>> >
>> > put_net() is a deprecated API, you should use put_net_track() instead.
>> >
>> > > kfree(vsock->dev.vqs);
>> > > vhost_vsock_free(vsock);
>> > > return 0;
>> >
>> > Also series introducing new features should also include the related
>> > self-tests.
>>
>> Yes, I was thinking about testing as well, but to test this I think we need
>> to run QEMU with Linux in it, is this feasible in self-tests?
>>
>> We should start looking at that, because for now I have my own ansible
>> script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
>> test both host (vhost-vsock) and guest (virtio-vsock).
>>
>
>Maybe as a baseline we could follow the model of
>tools/testing/selftests/bpf/vmtest.sh and start by reusing your
>vsock_test parameters from your Ansible script?
Yeah, my playbooks are here:
https://github.com/stefano-garzarella/ansible-vsock
Note: they are heavily customized for my env; I wrote some notes on how
to change the various hard-coded paths.
>
>I don't mind writing the patches.
That would be great and very much appreciated.
Maybe you can do it in a separate series and then here add just the
configuration we need.
Thanks,
Stefano
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-21 10:02 ` Stefano Garzarella
@ 2025-03-21 16:43 ` Bobby Eshleman
2025-03-26 0:11 ` Bobby Eshleman
1 sibling, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-21 16:43 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Fri, Mar 21, 2025 at 11:02:34AM +0100, Stefano Garzarella wrote:
> On Thu, Mar 20, 2025 at 02:05:38PM -0700, Bobby Eshleman wrote:
> > On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
> > > On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
> > > > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
> > > > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> > > > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
> > > > >
> > > > > vhost_dev_cleanup(&vsock->dev);
> > > > > + if (vsock->net)
> > > > > + put_net(vsock->net);
> > > >
> > > > put_net() is a deprecated API, you should use put_net_track() instead.
> > > >
> > > > > kfree(vsock->dev.vqs);
> > > > > vhost_vsock_free(vsock);
> > > > > return 0;
> > > >
> > > > Also series introducing new features should also include the related
> > > > self-tests.
> > >
> > > Yes, I was thinking about testing as well, but to test this I think we need
> > > to run QEMU with Linux in it, is this feasible in self-tests?
> > >
> > > We should start looking at that, because for now I have my own ansible
> > > script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
> > > test both host (vhost-vsock) and guest (virtio-vsock).
> > >
> >
> > Maybe as a baseline we could follow the model of
> > tools/testing/selftests/bpf/vmtest.sh and start by reusing your
> > vsock_test parameters from your Ansible script?
>
> Yeah, my playbooks are here:
> https://github.com/stefano-garzarella/ansible-vsock
>
> Note: they are heavily customized for my env; I wrote some notes on how to
> change the various hard-coded paths.
>
> >
> > I don't mind writing the patches.
>
> That would be great and very much appreciated.
> Maybe you can do it in a separate series and then here add just the
> configuration we need.
>
> Thanks,
> Stefano
>
Sounds like a plan. Thanks!
Best,
Bobby
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-21 10:02 ` Stefano Garzarella
2025-03-21 16:43 ` Bobby Eshleman
@ 2025-03-26 0:11 ` Bobby Eshleman
2025-03-27 9:14 ` Stefano Garzarella
1 sibling, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-26 0:11 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Fri, Mar 21, 2025 at 11:02:34AM +0100, Stefano Garzarella wrote:
> On Thu, Mar 20, 2025 at 02:05:38PM -0700, Bobby Eshleman wrote:
> > On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
> > > On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
> > > > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
> > > > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> > > > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
> > > > >
> > > > > vhost_dev_cleanup(&vsock->dev);
> > > > > + if (vsock->net)
> > > > > + put_net(vsock->net);
> > > >
> > > > put_net() is a deprecated API, you should use put_net_track() instead.
> > > >
> > > > > kfree(vsock->dev.vqs);
> > > > > vhost_vsock_free(vsock);
> > > > > return 0;
> > > >
> > > > Also series introducing new features should also include the related
> > > > self-tests.
> > >
> > > Yes, I was thinking about testing as well, but to test this I think we need
> > > to run QEMU with Linux in it, is this feasible in self-tests?
> > >
> > > We should start looking at that, because for now I have my own ansible
> > > script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
> > > test both host (vhost-vsock) and guest (virtio-vsock).
> > >
> >
> > Maybe as a baseline we could follow the model of
> > tools/testing/selftests/bpf/vmtest.sh and start by reusing your
> > vsock_test parameters from your Ansible script?
>
> Yeah, my playbooks are here:
> https://github.com/stefano-garzarella/ansible-vsock
>
> Note: they are heavily customized on my env, I wrote some notes on how to
> change various wired path.
>
> >
> > I don't mind writing the patches.
>
> That would be great and very much appreciated.
> Maybe you can do it in a separate series and then here add just the
> configuration we need.
>
> Thanks,
> Stefano
>
Hey Stefano,
I noticed that bpf/vmtest.sh uses images hosted by libbpf's CI/CD. I
wonder if you have any thoughts on a good repo we could use to pull our
qcow image(s)? Or a preferred way to host some images, if no repo
exists?
Thanks,
Bobby
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-26 0:11 ` Bobby Eshleman
@ 2025-03-27 9:14 ` Stefano Garzarella
2025-03-28 16:07 ` Bobby Eshleman
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-27 9:14 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Tue, Mar 25, 2025 at 05:11:49PM -0700, Bobby Eshleman wrote:
>On Fri, Mar 21, 2025 at 11:02:34AM +0100, Stefano Garzarella wrote:
>> On Thu, Mar 20, 2025 at 02:05:38PM -0700, Bobby Eshleman wrote:
>> > On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
>> > > On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
>> > > > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
>> > > > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
>> > > > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
>> > > > >
>> > > > > vhost_dev_cleanup(&vsock->dev);
>> > > > > + if (vsock->net)
>> > > > > + put_net(vsock->net);
>> > > >
>> > > > put_net() is a deprecated API, you should use put_net_track() instead.
>> > > >
>> > > > > kfree(vsock->dev.vqs);
>> > > > > vhost_vsock_free(vsock);
>> > > > > return 0;
>> > > >
>> > > > Also series introducing new features should also include the related
>> > > > self-tests.
>> > >
>> > > Yes, I was thinking about testing as well, but to test this I think we need
>> > > to run QEMU with Linux in it, is this feasible in self-tests?
>> > >
>> > > We should start looking at that, because for now I have my own ansible
>> > > script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
>> > > test both host (vhost-vsock) and guest (virtio-vsock).
>> > >
>> >
>> > Maybe as a baseline we could follow the model of
>> > tools/testing/selftests/bpf/vmtest.sh and start by reusing your
>> > vsock_test parameters from your Ansible script?
>>
>> Yeah, my playbooks are here:
>> https://github.com/stefano-garzarella/ansible-vsock
>>
>> Note: they are heavily customized on my env, I wrote some notes on how to
>> change various wired path.
>>
>> >
>> > I don't mind writing the patches.
>>
>> That would be great and very much appreciated.
>> Maybe you can do it in a separate series and then here add just the
>> configuration we need.
>>
>> Thanks,
>> Stefano
>>
>
>Hey Stefano,
>
>I noticed that bpf/vmtest.sh uses images hosted from libbpf's CI/CD. I
>wonder if you have any thoughts on a good repo we may use to pull our
>qcow image(s)? Or a preferred way to host some images, if no repo
>exists?
Good question!
I created this group/repo mainly to keep track of work, not sure if we
can reuse it: https://gitlab.com/vsock/
I can add you there if you need to create a new repo, etc.
But I'm also open to other solutions.
Thanks,
Stefano
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-27 9:14 ` Stefano Garzarella
@ 2025-03-28 16:07 ` Bobby Eshleman
2025-03-28 16:19 ` Stefano Garzarella
0 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-28 16:07 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Thu, Mar 27, 2025 at 10:14:59AM +0100, Stefano Garzarella wrote:
> On Tue, Mar 25, 2025 at 05:11:49PM -0700, Bobby Eshleman wrote:
> > On Fri, Mar 21, 2025 at 11:02:34AM +0100, Stefano Garzarella wrote:
> > > On Thu, Mar 20, 2025 at 02:05:38PM -0700, Bobby Eshleman wrote:
> > > > On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
> > > > > On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
> > > > > > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
> > > > > > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> > > > > > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
> > > > > > >
> > > > > > > vhost_dev_cleanup(&vsock->dev);
> > > > > > > + if (vsock->net)
> > > > > > > + put_net(vsock->net);
> > > > > >
> > > > > > put_net() is a deprecated API, you should use put_net_track() instead.
> > > > > >
> > > > > > > kfree(vsock->dev.vqs);
> > > > > > > vhost_vsock_free(vsock);
> > > > > > > return 0;
> > > > > >
> > > > > > Also series introducing new features should also include the related
> > > > > > self-tests.
> > > > >
> > > > > Yes, I was thinking about testing as well, but to test this I think we need
> > > > > to run QEMU with Linux in it, is this feasible in self-tests?
> > > > >
> > > > > We should start looking at that, because for now I have my own ansible
> > > > > script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
> > > > > test both host (vhost-vsock) and guest (virtio-vsock).
> > > > >
> > > >
> > > > Maybe as a baseline we could follow the model of
> > > > tools/testing/selftests/bpf/vmtest.sh and start by reusing your
> > > > vsock_test parameters from your Ansible script?
> > >
> > > Yeah, my playbooks are here:
> > > https://github.com/stefano-garzarella/ansible-vsock
> > >
> > > Note: they are heavily customized on my env, I wrote some notes on how to
> > > change various wired path.
> > >
> > > >
> > > > I don't mind writing the patches.
> > >
> > > That would be great and very much appreciated.
> > > Maybe you can do it in a separate series and then here add just the
> > > configuration we need.
> > >
> > > Thanks,
> > > Stefano
> > >
> >
> > Hey Stefano,
> >
> > I noticed that bpf/vmtest.sh uses images hosted from libbpf's CI/CD. I
> > wonder if you have any thoughts on a good repo we may use to pull our
> > qcow image(s)? Or a preferred way to host some images, if no repo
> > exists?
>
> Good question!
>
> I created this group/repo mainily to keep trak of work, not sure if we can
> reuse: https://gitlab.com/vsock/
>
> I can add you there if you need to create new repo, etc.
>
> But I'm also open to other solutions.
>
Sounds good to me. I was also considering using virtme-ng, which would
avoid the need to host images, at the cost of the extra dependency. What
are your thoughts on that route?
Thanks,
Bobby
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-28 16:07 ` Bobby Eshleman
@ 2025-03-28 16:19 ` Stefano Garzarella
2025-03-28 20:14 ` Bobby Eshleman
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-28 16:19 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Fri, Mar 28, 2025 at 09:07:22AM -0700, Bobby Eshleman wrote:
>On Thu, Mar 27, 2025 at 10:14:59AM +0100, Stefano Garzarella wrote:
>> On Tue, Mar 25, 2025 at 05:11:49PM -0700, Bobby Eshleman wrote:
>> > On Fri, Mar 21, 2025 at 11:02:34AM +0100, Stefano Garzarella wrote:
>> > > On Thu, Mar 20, 2025 at 02:05:38PM -0700, Bobby Eshleman wrote:
>> > > > On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
>> > > > > On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
>> > > > > > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
>> > > > > > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
>> > > > > > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
>> > > > > > >
>> > > > > > > vhost_dev_cleanup(&vsock->dev);
>> > > > > > > + if (vsock->net)
>> > > > > > > + put_net(vsock->net);
>> > > > > >
>> > > > > > put_net() is a deprecated API, you should use put_net_track() instead.
>> > > > > >
>> > > > > > > kfree(vsock->dev.vqs);
>> > > > > > > vhost_vsock_free(vsock);
>> > > > > > > return 0;
>> > > > > >
>> > > > > > Also series introducing new features should also include the related
>> > > > > > self-tests.
>> > > > >
>> > > > > Yes, I was thinking about testing as well, but to test this I think we need
>> > > > > to run QEMU with Linux in it, is this feasible in self-tests?
>> > > > >
>> > > > > We should start looking at that, because for now I have my own ansible
>> > > > > script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
>> > > > > test both host (vhost-vsock) and guest (virtio-vsock).
>> > > > >
>> > > >
>> > > > Maybe as a baseline we could follow the model of
>> > > > tools/testing/selftests/bpf/vmtest.sh and start by reusing your
>> > > > vsock_test parameters from your Ansible script?
>> > >
>> > > Yeah, my playbooks are here:
>> > > https://github.com/stefano-garzarella/ansible-vsock
>> > >
>> > > Note: they are heavily customized on my env, I wrote some notes on how to
>> > > change various wired path.
>> > >
>> > > >
>> > > > I don't mind writing the patches.
>> > >
>> > > That would be great and very much appreciated.
>> > > Maybe you can do it in a separate series and then here add just the
>> > > configuration we need.
>> > >
>> > > Thanks,
>> > > Stefano
>> > >
>> >
>> > Hey Stefano,
>> >
>> > I noticed that bpf/vmtest.sh uses images hosted from libbpf's CI/CD. I
>> > wonder if you have any thoughts on a good repo we may use to pull our
>> > qcow image(s)? Or a preferred way to host some images, if no repo
>> > exists?
>>
>> Good question!
>>
>> I created this group/repo mainily to keep trak of work, not sure if we can
>> reuse: https://gitlab.com/vsock/
>>
>> I can add you there if you need to create new repo, etc.
>>
>> But I'm also open to other solutions.
>>
>
>Sounds good to me. I also was considering using virtme-ng, which would
>avoid the need, at the cost of the dependency. What are your thoughts on
>that route?
I just saw that Paolo had proposed the same thing, but his response went
off-list by mistake!
So I would say it is a path worth exploring. I have no experience with it,
but it looks like it could do the job!
Thanks,
Stefano
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-28 16:19 ` Stefano Garzarella
@ 2025-03-28 20:14 ` Bobby Eshleman
0 siblings, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-28 20:14 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Paolo Abeni, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Fri, Mar 28, 2025 at 05:19:44PM +0100, Stefano Garzarella wrote:
> On Fri, Mar 28, 2025 at 09:07:22AM -0700, Bobby Eshleman wrote:
> > On Thu, Mar 27, 2025 at 10:14:59AM +0100, Stefano Garzarella wrote:
> > > On Tue, Mar 25, 2025 at 05:11:49PM -0700, Bobby Eshleman wrote:
> > > > On Fri, Mar 21, 2025 at 11:02:34AM +0100, Stefano Garzarella wrote:
> > > > > On Thu, Mar 20, 2025 at 02:05:38PM -0700, Bobby Eshleman wrote:
> > > > > > On Thu, Mar 20, 2025 at 10:08:02AM +0100, Stefano Garzarella wrote:
> > > > > > > On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
> > > > > > > > On 3/12/25 9:59 PM, Bobby Eshleman wrote:
> > > > > > > > > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> > > > > > > > > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
> > > > > > > > >
> > > > > > > > > vhost_dev_cleanup(&vsock->dev);
> > > > > > > > > + if (vsock->net)
> > > > > > > > > + put_net(vsock->net);
> > > > > > > >
> > > > > > > > put_net() is a deprecated API, you should use put_net_track() instead.
> > > > > > > >
> > > > > > > > > kfree(vsock->dev.vqs);
> > > > > > > > > vhost_vsock_free(vsock);
> > > > > > > > > return 0;
> > > > > > > >
> > > > > > > > Also series introducing new features should also include the related
> > > > > > > > self-tests.
> > > > > > >
> > > > > > > Yes, I was thinking about testing as well, but to test this I think we need
> > > > > > > to run QEMU with Linux in it, is this feasible in self-tests?
> > > > > > >
> > > > > > > We should start looking at that, because for now I have my own ansible
> > > > > > > script that runs tests (tools/testing/vsock/vsock_test) in nested VMs to
> > > > > > > test both host (vhost-vsock) and guest (virtio-vsock).
> > > > > > >
> > > > > >
> > > > > > Maybe as a baseline we could follow the model of
> > > > > > tools/testing/selftests/bpf/vmtest.sh and start by reusing your
> > > > > > vsock_test parameters from your Ansible script?
> > > > >
> > > > > Yeah, my playbooks are here:
> > > > > https://github.com/stefano-garzarella/ansible-vsock
> > > > >
> > > > > Note: they are heavily customized on my env, I wrote some notes on how to
> > > > > change various wired path.
> > > > >
> > > > > >
> > > > > > I don't mind writing the patches.
> > > > >
> > > > > That would be great and very much appreciated.
> > > > > Maybe you can do it in a separate series and then here add just the
> > > > > configuration we need.
> > > > >
> > > > > Thanks,
> > > > > Stefano
> > > > >
> > > >
> > > > Hey Stefano,
> > > >
> > > > I noticed that bpf/vmtest.sh uses images hosted from libbpf's CI/CD. I
> > > > wonder if you have any thoughts on a good repo we may use to pull our
> > > > qcow image(s)? Or a preferred way to host some images, if no repo
> > > > exists?
> > >
> > > Good question!
> > >
> > > I created this group/repo mainily to keep trak of work, not sure if we can
> > > reuse: https://gitlab.com/vsock/
> > >
> > > I can add you there if you need to create new repo, etc.
> > >
> > > But I'm also open to other solutions.
> > >
> >
> > Sounds good to me. I also was considering using virtme-ng, which would
> > avoid the need, at the cost of the dependency. What are your thoughts on
> > that route?
>
> I just saw that Paolo had proposed the same, but his response was off-list
> by mistake!
>
> So I would say it is an explorable path. I have no experience with it, but
> it looks like it could do the job!
Sounds good! I'm currently prototyping with it, so, barring unforeseen
issues, that will likely be what v1 uses.
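For the curious, the rough shape of the prototype is something like the
following (a sketch only; the vng flags are from memory, so treat them as an
assumption and double-check against `vng --help`):

    # Build the kernel in the current source tree.
    vng --build

    # Boot the freshly built kernel in a lightweight VM and drop into a shell.
    vng

    # Or run a single command inside the guest and exit.
    vng -- uname -r

How the host and guest halves of vsock_test get wired together on top of that
is still an open question in the prototype.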
Thanks,
Bobby
* Re: [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device
2025-03-19 21:09 ` Paolo Abeni
2025-03-20 9:08 ` Stefano Garzarella
@ 2025-03-20 20:57 ` Bobby Eshleman
1 sibling, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-20 20:57 UTC (permalink / raw)
To: Paolo Abeni
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Wed, Mar 19, 2025 at 10:09:44PM +0100, Paolo Abeni wrote:
> On 3/12/25 9:59 PM, Bobby Eshleman wrote:
> > @@ -753,6 +783,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> > virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
> >
> > vhost_dev_cleanup(&vsock->dev);
> > + if (vsock->net)
> > + put_net(vsock->net);
>
> put_net() is a deprecated API, you should use put_net_track() instead.
>
Got it, thanks!
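For my own notes, a minimal sketch of what the tracked helpers could look like
in vhost-vsock (the ns_tracker member name below is only an assumption for
illustration, not necessarily what the next revision will use):

    /* Sketch only: assumes a netns_tracker member is added to struct vhost_vsock. */
    struct vhost_vsock {
            struct vhost_dev dev;
            struct net *net;
            netns_tracker ns_tracker;   /* assumed new member */
            /* ... */
    };

    /* On open, take a tracked reference instead of a plain get_net(): */
    vsock->net = get_net_track(current->nsproxy->net_ns,
                               &vsock->ns_tracker, GFP_KERNEL);

    /* On release, drop it with the matching tracked helper: */
    if (vsock->net)
            put_net_track(vsock->net, &vsock->ns_tracker);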
Best,
Bobby
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-12 20:59 [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
` (2 preceding siblings ...)
2025-03-12 20:59 ` [PATCH v2 3/3] vhost/vsock: use netns of process that opens the vhost-vsock-netns device Bobby Eshleman
@ 2025-03-13 2:28 ` Bobby Eshleman
2025-03-13 15:37 ` Stefano Garzarella
2025-03-21 19:49 ` Michael S. Tsirkin
2025-03-28 17:03 ` Stefano Garzarella
5 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-13 2:28 UTC (permalink / raw)
To: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list
Cc: David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
Hey all,
Apologies for forgetting the 'net-next' prefix on this one. Should I
resend or no?
Best,
Bobby
On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> Picking up Stefano's v1 [1], this series adds netns support to
> vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> namespaces, defering that for future implementation and discussion.
>
> Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> "scoped" vsock, accessible only to sockets in its namespace. If a global
> vsock or scoped vsock share the same CID, the scoped vsock takes
> precedence.
>
> If a socket in a namespace connects with a global vsock, the CID becomes
> unavailable to any VMM in that namespace when creating new vsocks. If
> disconnected, the CID becomes available again.
>
> Testing
>
> QEMU with /dev/vhost-vsock-netns support:
> https://github.com/beshleman/qemu/tree/vsock-netns
>
> Test: Scoped vsocks isolated by namespace
>
> host# ip netns add ns1
> host# ip netns add ns2
> host# ip netns exec ns1 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE1} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
> host# ip netns exec ns2 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
>
> host# socat - VSOCK-CONNECT:15:1234
> 2025/03/10 17:09:40 socat[255741] E connect(5, AF=40 cid:15 port:1234, 16): No such device
>
> host# echo foobar1 | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> host# echo foobar2 | sudo ip netns exec ns2 socat - VSOCK-CONNECT:15:1234
>
> vm1# socat - VSOCK-LISTEN:1234
> foobar1
> vm2# socat - VSOCK-LISTEN:1234
> foobar2
>
> Test: Global vsocks accessible to any namespace
>
> host# qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,guest-cid=15,netns=off
>
> host# echo foobar | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
>
> vm# socat - VSOCK-LISTEN:1234
> foobar
>
> Test: Connecting to global vsock makes CID unavailble to namespace
>
> host# qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,guest-cid=15,netns=off
>
> vm# socat - VSOCK-LISTEN:1234
>
> host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> host# ip netns exec ns1 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE1} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
>
> qemu-system-x86_64: -device vhost-vsock-pci,netns=on,guest-cid=15: vhost-vsock: unable to set guest cid: Address already in use
>
> Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
> ---
> Changes in v2:
> - only support vhost-vsock namespaces
> - all g2h namespaces retain old behavior, only common API changes
> impacted by vhost-vsock changes
> - add /dev/vhost-vsock-netns for "opt-in"
> - leave /dev/vhost-vsock to old behavior
> - removed netns module param
> - Link to v1: https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
>
> Changes in v1:
> - added 'netns' module param to vsock.ko to enable the
> network namespace support (disabled by default)
> - added 'vsock_net_eq()' to check the "net" assigned to a socket
> only when 'netns' support is enabled
> - Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
>
> ---
> Stefano Garzarella (3):
> vsock: add network namespace support
> vsock/virtio_transport_common: handle netns of received packets
> vhost/vsock: use netns of process that opens the vhost-vsock-netns device
>
> drivers/vhost/vsock.c | 96 +++++++++++++++++++++++++++------
> include/linux/miscdevice.h | 1 +
> include/linux/virtio_vsock.h | 2 +
> include/net/af_vsock.h | 10 ++--
> net/vmw_vsock/af_vsock.c | 85 +++++++++++++++++++++++------
> net/vmw_vsock/hyperv_transport.c | 2 +-
> net/vmw_vsock/virtio_transport.c | 5 +-
> net/vmw_vsock/virtio_transport_common.c | 14 ++++-
> net/vmw_vsock/vmci_transport.c | 4 +-
> net/vmw_vsock/vsock_loopback.c | 4 +-
> 10 files changed, 180 insertions(+), 43 deletions(-)
> ---
> base-commit: 0ea09cbf8350b70ad44d67a1dcb379008a356034
> change-id: 20250312-vsock-netns-45da9424f726
>
> Best regards,
> --
> Bobby Eshleman <bobbyeshleman@gmail.com>
>
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-13 2:28 ` [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
@ 2025-03-13 15:37 ` Stefano Garzarella
2025-03-13 16:20 ` Bobby Eshleman
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-13 15:37 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
Hi Bobby,
first of all, thank you for starting this work again!
On Wed, Mar 12, 2025 at 07:28:33PM -0700, Bobby Eshleman wrote:
>Hey all,
>
>Apologies for forgetting the 'net-next' prefix on this one. Should I
>resend or no?
I'd say let's do a first review cycle on this, then you can re-post.
Please also check the maintainers CCed; it looks like someone is missing:
https://patchwork.kernel.org/project/netdevbpf/patch/20250312-vsock-netns-v2-1-84bffa1aa97a@gmail.com/
>
>Best,
>Bobby
>
>On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
>> Picking up Stefano's v1 [1], this series adds netns support to
>> vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
>> namespaces, defering that for future implementation and discussion.
>>
>> Any vsock created with /dev/vhost-vsock is a global vsock, accessible
>> from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
>> "scoped" vsock, accessible only to sockets in its namespace. If a global
>> vsock or scoped vsock share the same CID, the scoped vsock takes
>> precedence.
This is inside the netns, right?
I mean, if we are in a netns, and there is a VM A attached to
/dev/vhost-vsock-netns with CID=42 and a VM B attached to
/dev/vhost-vsock also with CID=42, this means that VM B will not be
reachable from inside that netns, but it can be reached from outside of
the netns, right?
>>
>> If a socket in a namespace connects with a global vsock, the CID becomes
>> unavailable to any VMM in that namespace when creating new vsocks. If
>> disconnected, the CID becomes available again.
IIUC, if an application in the host running in a netns is connected to a
guest attached to /dev/vhost-vsock (e.g. CID=42), a new guest can't ask
for the same CID (42) on /dev/vhost-vsock-netns in the same netns while
that connection is active. Is that right?
>>
>> Testing
>>
>> QEMU with /dev/vhost-vsock-netns support:
>> https://github.com/beshleman/qemu/tree/vsock-netns
You can also use unmodified QEMU via the `vhostfd` parameter of the
`vhost-vsock-pci` device:
# FD will contain the file descriptor to /dev/vhost-vsock-netns
exec {FD}<>/dev/vhost-vsock-netns
# pass FD to the device, this is used for example by libvirt
qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \
-drive file=fedora.qcow2,format=qcow2,if=virtio \
-object memory-backend-memfd,id=mem,size=512M \
-device vhost-vsock-pci,vhostfd=${FD},guest-cid=42 -nographic
That said, I agree we can extend QEMU with a `netns` param too.
BTW, I'm traveling; I'll be back next Tuesday and I hope to take a
deeper look at the patches.
Thanks,
Stefano
>>
>> Test: Scoped vsocks isolated by namespace
>>
>> host# ip netns add ns1
>> host# ip netns add ns2
>> host# ip netns exec ns1 \
>> qemu-system-x86_64 \
>> -m 8G -smp 4 -cpu host -enable-kvm \
>> -serial mon:stdio \
>> -drive if=virtio,file=${IMAGE1} \
>> -device
>> vhost-vsock-pci,netns=on,guest-cid=15
>> host# ip netns exec ns2 \
>> qemu-system-x86_64 \
>> -m 8G -smp 4 -cpu host -enable-kvm \
>> -serial mon:stdio \
>> -drive if=virtio,file=${IMAGE2} \
>> -device vhost-vsock-pci,netns=on,guest-cid=15
>>
>> host# socat - VSOCK-CONNECT:15:1234
>> 2025/03/10 17:09:40 socat[255741] E connect(5, AF=40 cid:15 port:1234, 16): No such device
>>
>> host# echo foobar1 | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
>> host# echo foobar2 | sudo ip netns exec ns2 socat - VSOCK-CONNECT:15:1234
>>
>> vm1# socat - VSOCK-LISTEN:1234
>> foobar1
>> vm2# socat - VSOCK-LISTEN:1234
>> foobar2
>>
>> Test: Global vsocks accessible to any namespace
>>
>> host# qemu-system-x86_64 \
>> -m 8G -smp 4 -cpu host -enable-kvm \
>> -serial mon:stdio \)
>> -drive if=virtio,file=${IMAGE2} \
>> -device vhost-vsock-pci,guest-cid=15,netns=off
>>
>> host# echo foobar | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
>>
>> vm# socat - VSOCK-LISTEN:1234
>> foobar
>>
>> Test: Connecting to global vsock makes CID unavailble to namespace
>>
>> host# qemu-system-x86_64 \
>> -m 8G -smp 4 -cpu host -enable-kvm \
>> -serial mon:stdio \
>> -drive if=virtio,file=${IMAGE2} \
>> -device vhost-vsock-pci,guest-cid=15,netns=off
>>
>> vm# socat - VSOCK-LISTEN:1234
>>
>> host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
>> host# ip netns exec ns1 \
>> qemu-system-x86_64 \
>> -m 8G -smp 4 -cpu host -enable-kvm \
>> -serial mon:stdio \
>> -drive if=virtio,file=${IMAGE1} \
>> -device vhost-vsock-pci,netns=on,guest-cid=15
>>
>> qemu-system-x86_64: -device vhost-vsock-pci,netns=on,guest-cid=15: vhost-vsock: unable to set guest cid: Address already in use
>>
>> Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
>> ---
>> Changes in v2:
>> - only support vhost-vsock namespaces
>> - all g2h namespaces retain old behavior, only common API changes
>> impacted by vhost-vsock changes
>> - add /dev/vhost-vsock-netns for "opt-in"
>> - leave /dev/vhost-vsock to old behavior
>> - removed netns module param
>> - Link to v1: https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
>>
>> Changes in v1:
>> - added 'netns' module param to vsock.ko to enable the
>> network namespace support (disabled by default)
>> - added 'vsock_net_eq()' to check the "net" assigned to a socket
>> only when 'netns' support is enabled
>> - Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
>>
>> ---
>> Stefano Garzarella (3):
>> vsock: add network namespace support
>> vsock/virtio_transport_common: handle netns of received packets
>> vhost/vsock: use netns of process that opens the vhost-vsock-netns device
>>
>> drivers/vhost/vsock.c | 96 +++++++++++++++++++++++++++------
>> include/linux/miscdevice.h | 1 +
>> include/linux/virtio_vsock.h | 2 +
>> include/net/af_vsock.h | 10 ++--
>> net/vmw_vsock/af_vsock.c | 85 +++++++++++++++++++++++------
>> net/vmw_vsock/hyperv_transport.c | 2 +-
>> net/vmw_vsock/virtio_transport.c | 5 +-
>> net/vmw_vsock/virtio_transport_common.c | 14 ++++-
>> net/vmw_vsock/vmci_transport.c | 4 +-
>> net/vmw_vsock/vsock_loopback.c | 4 +-
>> 10 files changed, 180 insertions(+), 43 deletions(-)
>> ---
>> base-commit: 0ea09cbf8350b70ad44d67a1dcb379008a356034
>> change-id: 20250312-vsock-netns-45da9424f726
>>
>> Best regards,
>> --
>> Bobby Eshleman <bobbyeshleman@gmail.com>
>>
>
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-13 15:37 ` Stefano Garzarella
@ 2025-03-13 16:20 ` Bobby Eshleman
0 siblings, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-13 16:20 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Thu, Mar 13, 2025 at 04:37:16PM +0100, Stefano Garzarella wrote:
> Hi Bobby,
> first of all, thank you for starting this work again!
>
You're welcome, thank you for your work getting it started!
> On Wed, Mar 12, 2025 at 07:28:33PM -0700, Bobby Eshleman wrote:
> > Hey all,
> >
> > Apologies for forgetting the 'net-next' prefix on this one. Should I
> > resend or no?
>
> I'd say let's do a firts review cycle on this, then you can re-post.
> Please check also maintainer cced, it looks like someone is missing:
> https://patchwork.kernel.org/project/netdevbpf/patch/20250312-vsock-netns-v2-1-84bffa1aa97a@gmail.com/
>
Duly noted, I'll double-check the CCs next time. SGTM on the re-post!
> > On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> > > Picking up Stefano's v1 [1], this series adds netns support to
> > > vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> > > namespaces, defering that for future implementation and discussion.
> > >
> > > Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> > > from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> > > "scoped" vsock, accessible only to sockets in its namespace. If a global
> > > vsock or scoped vsock share the same CID, the scoped vsock takes
> > > precedence.
>
> This inside the netns, right?
> I mean if we are in a netns, and there is a VM A attached to
> /dev/vhost-vsock-netns witch CID=42 and a VM B attached to /dev/vhost-vsock
> also with CID=42, this means that VM A will not be accessible in the netns,
> but it can be accessible outside of the netns,
> right?
>
In this scenario, CID=42 goes to VM A (/dev/vhost-vsock-netns) for any
socket in its namespace. For any other namespace, CID=42 will go to VM
B (/dev/vhost-vsock).
If I understand your setup correctly:
Namespace 1:
VM A - /dev/vhost-vsock-netns, CID=42
Process X
Namespace 2:
VM B - /dev/vhost-vsock, CID=42
Process Y
Namespace 3:
Process Z
In this scenario, taking connect() as an example:
Process X connect(CID=42) goes to VM A
Process Y connect(CID=42) goes to VM B
Process Z connect(CID=42) goes to VM B
If VM A goes away (migration, shutdown, etc...):
Process X connect(CID=42) also goes to VM B
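To make that precedence concrete, the device lookup boils down to a
namespace-first search with the global device kept as a fallback. A rough
sketch (the helper name and fields here are assumptions for illustration, not
necessarily the exact code in patch 3/3):

    /* Find the vhost-vsock device serving guest_cid for a socket in @net.
     * A device opened via /dev/vhost-vsock-netns (vsock->net != NULL) in
     * the same netns wins; a global /dev/vhost-vsock device (vsock->net
     * == NULL) is only used as a fallback.
     */
    static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net)
    {
            struct vhost_vsock *vsock, *fallback = NULL;

            hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
                    if (vsock->guest_cid != guest_cid)
                            continue;
                    if (vsock->net == net)
                            return vsock;       /* scoped match in this netns */
                    if (!vsock->net)
                            fallback = vsock;   /* global device */
            }
            return fallback;
    }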
> > >
> > > If a socket in a namespace connects with a global vsock, the CID becomes
> > > unavailable to any VMM in that namespace when creating new vsocks. If
> > > disconnected, the CID becomes available again.
>
> IIUC if an application in the host running in a netns, is connected to a
> guest attached to /dev/vhost-vsock (e.g. CID=42), a new guest can't be ask
> for the same CID (42) on /dev/vhost-vsock-netns in the same netns till that
> connection is active. Is that right?
>
Right. Here is the scenario I am trying to avoid:
Step 1: namespace 1, VM A allocated with CID 42 on /dev/vhost-vsock
Step 2: namespace 2, connect(CID=42) (this is legal, preserves old
behavior)
Step 3: namespace 2, VM B allocated with CID 42 on
/dev/vhost-vsock-netns
After step 3, CID=42 in this current namespace should belong to VM B, but
the connection from step 2 would be with VM A.
I think we have some options:
1. disallow the new VM B because the namespace is already active with VM A
2. try and allow the existing connection to continue, but just make sure
that new connections go to VM B
3. close the connection from namespace 2, spin up VM B, hope user
manages connection retry
4. auto-retry connect to the new VM B? (seems like doing too much on the
kernel side to me)
I chose option 1 for this rev mostly for its simplicity, but I'm definitely
open to suggestions. I think option 3 would also be simple to implement.
Option 2 would require adding some concept of "vhost-vsock ns at time of
connection" to each socket, so the transport would know which vhost_vsock
to use for which socket.
> > >
> > > Testing
> > >
> > > QEMU with /dev/vhost-vsock-netns support:
> > > https://github.com/beshleman/qemu/tree/vsock-netns
>
> You can also use unmodified QEMU using `vhostfd` parameter of
> `vhost-vsock-pci` device:
>
> # FD will contain the file descriptor to /dev/vhost-vsock-netns
> exec {FD}<>/dev/vhost-vsock-netns
>
> # pass FD to the device, this is used for example by libvirt
> qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \
> -drive file=fedora.qcow2,format=qcow2,if=virtio \
> -object memory-backend-memfd,id=mem,size=512M \
> -device vhost-vsock-pci,vhostfd=${FD},guest-cid=42 -nographic
>
Very nice, thanks, I didn't realize that!
> That said, I agree we can extend QEMU with `netns` param too.
>
I'm open to either. Your solution above is super elegant.
> BTW, I'm traveling, I'll be back next Tuesday and I hope to take a deeper
> look to the patches.
>
> Thanks,
> Stefano
>
Thanks, Stefano! Enjoy the trip.
Best,
Bobby
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-12 20:59 [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
` (3 preceding siblings ...)
2025-03-13 2:28 ` [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
@ 2025-03-21 19:49 ` Michael S. Tsirkin
2025-03-22 1:04 ` Bobby Eshleman
2025-03-28 17:03 ` Stefano Garzarella
5 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2025-03-21 19:49 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> Picking up Stefano's v1 [1], this series adds netns support to
> vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> namespaces, defering that for future implementation and discussion.
>
> Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> "scoped" vsock, accessible only to sockets in its namespace. If a global
> vsock or scoped vsock share the same CID, the scoped vsock takes
> precedence.
>
> If a socket in a namespace connects with a global vsock, the CID becomes
> unavailable to any VMM in that namespace when creating new vsocks. If
> disconnected, the CID becomes available again.
Yeah, that's a sane way to do it.
Thanks!
> Testing
>
> QEMU with /dev/vhost-vsock-netns support:
> https://github.com/beshleman/qemu/tree/vsock-netns
>
> Test: Scoped vsocks isolated by namespace
>
> host# ip netns add ns1
> host# ip netns add ns2
> host# ip netns exec ns1 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE1} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
> host# ip netns exec ns2 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
>
> host# socat - VSOCK-CONNECT:15:1234
> 2025/03/10 17:09:40 socat[255741] E connect(5, AF=40 cid:15 port:1234, 16): No such device
>
> host# echo foobar1 | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> host# echo foobar2 | sudo ip netns exec ns2 socat - VSOCK-CONNECT:15:1234
>
> vm1# socat - VSOCK-LISTEN:1234
> foobar1
> vm2# socat - VSOCK-LISTEN:1234
> foobar2
>
> Test: Global vsocks accessible to any namespace
>
> host# qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,guest-cid=15,netns=off
>
> host# echo foobar | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
>
> vm# socat - VSOCK-LISTEN:1234
> foobar
>
> Test: Connecting to global vsock makes CID unavailble to namespace
>
> host# qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,guest-cid=15,netns=off
>
> vm# socat - VSOCK-LISTEN:1234
>
> host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> host# ip netns exec ns1 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE1} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
>
> qemu-system-x86_64: -device vhost-vsock-pci,netns=on,guest-cid=15: vhost-vsock: unable to set guest cid: Address already in use
>
> Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
> ---
> Changes in v2:
> - only support vhost-vsock namespaces
> - all g2h namespaces retain old behavior, only common API changes
> impacted by vhost-vsock changes
> - add /dev/vhost-vsock-netns for "opt-in"
> - leave /dev/vhost-vsock to old behavior
> - removed netns module param
> - Link to v1: https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
>
> Changes in v1:
> - added 'netns' module param to vsock.ko to enable the
> network namespace support (disabled by default)
> - added 'vsock_net_eq()' to check the "net" assigned to a socket
> only when 'netns' support is enabled
> - Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
>
> ---
> Stefano Garzarella (3):
> vsock: add network namespace support
> vsock/virtio_transport_common: handle netns of received packets
> vhost/vsock: use netns of process that opens the vhost-vsock-netns device
>
> drivers/vhost/vsock.c | 96 +++++++++++++++++++++++++++------
> include/linux/miscdevice.h | 1 +
> include/linux/virtio_vsock.h | 2 +
> include/net/af_vsock.h | 10 ++--
> net/vmw_vsock/af_vsock.c | 85 +++++++++++++++++++++++------
> net/vmw_vsock/hyperv_transport.c | 2 +-
> net/vmw_vsock/virtio_transport.c | 5 +-
> net/vmw_vsock/virtio_transport_common.c | 14 ++++-
> net/vmw_vsock/vmci_transport.c | 4 +-
> net/vmw_vsock/vsock_loopback.c | 4 +-
> 10 files changed, 180 insertions(+), 43 deletions(-)
> ---
> base-commit: 0ea09cbf8350b70ad44d67a1dcb379008a356034
> change-id: 20250312-vsock-netns-45da9424f726
>
> Best regards,
> --
> Bobby Eshleman <bobbyeshleman@gmail.com>
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-21 19:49 ` Michael S. Tsirkin
@ 2025-03-22 1:04 ` Bobby Eshleman
0 siblings, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-22 1:04 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Fri, Mar 21, 2025 at 03:49:38PM -0400, Michael S. Tsirkin wrote:
> On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> > Picking up Stefano's v1 [1], this series adds netns support to
> > vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> > namespaces, defering that for future implementation and discussion.
> >
> > Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> > from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> > "scoped" vsock, accessible only to sockets in its namespace. If a global
> > vsock or scoped vsock share the same CID, the scoped vsock takes
> > precedence.
> >
> > If a socket in a namespace connects with a global vsock, the CID becomes
> > unavailable to any VMM in that namespace when creating new vsocks. If
> > disconnected, the CID becomes available again.
>
>
> yea that's a sane way to do it.
> Thanks!
>
SGTM, thank you!
Best,
Bobby
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-12 20:59 [PATCH v2 0/3] vsock: add namespace support to vhost-vsock Bobby Eshleman
` (4 preceding siblings ...)
2025-03-21 19:49 ` Michael S. Tsirkin
@ 2025-03-28 17:03 ` Stefano Garzarella
2025-03-28 20:13 ` Bobby Eshleman
2025-04-01 19:05 ` Daniel P. Berrangé
5 siblings, 2 replies; 42+ messages in thread
From: Stefano Garzarella @ 2025-03-28 17:03 UTC (permalink / raw)
To: Bobby Eshleman, Daniel Berrangé
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
CCing Daniel
On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
>Picking up Stefano's v1 [1], this series adds netns support to
>vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
>namespaces, defering that for future implementation and discussion.
>
>Any vsock created with /dev/vhost-vsock is a global vsock, accessible
>from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
>"scoped" vsock, accessible only to sockets in its namespace. If a global
>vsock or scoped vsock share the same CID, the scoped vsock takes
>precedence.
>
>If a socket in a namespace connects with a global vsock, the CID becomes
>unavailable to any VMM in that namespace when creating new vsocks. If
>disconnected, the CID becomes available again.
I was talking about this feature with Daniel and he pointed out
something interesting (Daniel please feel free to correct me):
If we have a process in the host that does a listen(AF_VSOCK) in a
namespace, can this receive connections from guests connected to
/dev/vhost-vsock in any namespace?
Should we provide something (e.g. sysctl/sysfs entry) to disable
this behaviour, preventing a process in a namespace from receiving
connections from the global vsock address space (i.e.
/dev/vhost-vsock VMs)?
I understand that by default we should probably allow this behaviour in
order not to break current applications, but in some cases the user may
also want to isolate sockets in a namespace from being accessed by VMs
running in the global vsock address space.
Indeed, in this series we have talked mostly about the host -> guest path
(as the direction of the connection), but little about the guest -> host
path; maybe we should explain it better in the cover/commit
descriptions/documentation.
Thanks,
Stefano
>
>Testing
>
>QEMU with /dev/vhost-vsock-netns support:
> https://github.com/beshleman/qemu/tree/vsock-netns
>
>Test: Scoped vsocks isolated by namespace
>
> host# ip netns add ns1
> host# ip netns add ns2
> host# ip netns exec ns1 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE1} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
> host# ip netns exec ns2 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
>
> host# socat - VSOCK-CONNECT:15:1234
> 2025/03/10 17:09:40 socat[255741] E connect(5, AF=40 cid:15 port:1234, 16): No such device
>
> host# echo foobar1 | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> host# echo foobar2 | sudo ip netns exec ns2 socat - VSOCK-CONNECT:15:1234
>
> vm1# socat - VSOCK-LISTEN:1234
> foobar1
> vm2# socat - VSOCK-LISTEN:1234
> foobar2
>
>Test: Global vsocks accessible to any namespace
>
> host# qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,guest-cid=15,netns=off
>
> host# echo foobar | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
>
> vm# socat - VSOCK-LISTEN:1234
> foobar
>
>Test: Connecting to global vsock makes CID unavailble to namespace
>
> host# qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,guest-cid=15,netns=off
>
> vm# socat - VSOCK-LISTEN:1234
>
> host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> host# ip netns exec ns1 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE1} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
>
> qemu-system-x86_64: -device vhost-vsock-pci,netns=on,guest-cid=15: vhost-vsock: unable to set guest cid: Address already in use
>
>Signed-off-by: Bobby Eshleman <bobbyeshleman@gmail.com>
>---
>Changes in v2:
>- only support vhost-vsock namespaces
>- all g2h namespaces retain old behavior, only common API changes
> impacted by vhost-vsock changes
>- add /dev/vhost-vsock-netns for "opt-in"
>- leave /dev/vhost-vsock to old behavior
>- removed netns module param
>- Link to v1: https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
>
>Changes in v1:
>- added 'netns' module param to vsock.ko to enable the
> network namespace support (disabled by default)
>- added 'vsock_net_eq()' to check the "net" assigned to a socket
> only when 'netns' support is enabled
>- Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
>
>---
>Stefano Garzarella (3):
> vsock: add network namespace support
> vsock/virtio_transport_common: handle netns of received packets
> vhost/vsock: use netns of process that opens the vhost-vsock-netns device
>
> drivers/vhost/vsock.c | 96 +++++++++++++++++++++++++++------
> include/linux/miscdevice.h | 1 +
> include/linux/virtio_vsock.h | 2 +
> include/net/af_vsock.h | 10 ++--
> net/vmw_vsock/af_vsock.c | 85 +++++++++++++++++++++++------
> net/vmw_vsock/hyperv_transport.c | 2 +-
> net/vmw_vsock/virtio_transport.c | 5 +-
> net/vmw_vsock/virtio_transport_common.c | 14 ++++-
> net/vmw_vsock/vmci_transport.c | 4 +-
> net/vmw_vsock/vsock_loopback.c | 4 +-
> 10 files changed, 180 insertions(+), 43 deletions(-)
>---
>base-commit: 0ea09cbf8350b70ad44d67a1dcb379008a356034
>change-id: 20250312-vsock-netns-45da9424f726
>
>Best regards,
>--
>Bobby Eshleman <bobbyeshleman@gmail.com>
>
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-28 17:03 ` Stefano Garzarella
@ 2025-03-28 20:13 ` Bobby Eshleman
2025-04-01 19:05 ` Daniel P. Berrangé
1 sibling, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-03-28 20:13 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Daniel Berrangé, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Fri, Mar 28, 2025 at 06:03:19PM +0100, Stefano Garzarella wrote:
> CCing Daniel
>
> On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> > Picking up Stefano's v1 [1], this series adds netns support to
> > vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> > namespaces, defering that for future implementation and discussion.
> >
> > Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> > from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> > "scoped" vsock, accessible only to sockets in its namespace. If a global
> > vsock or scoped vsock share the same CID, the scoped vsock takes
> > precedence.
> >
> > If a socket in a namespace connects with a global vsock, the CID becomes
> > unavailable to any VMM in that namespace when creating new vsocks. If
> > disconnected, the CID becomes available again.
>
> I was talking about this feature with Daniel and he pointed out something
> interesting (Daniel please feel free to correct me):
>
> If we have a process in the host that does a listen(AF_VSOCK) in a
> namespace, can this receive connections from guests connected to
> /dev/vhost-vsock in any namespace?
>
> Should we provide something (e.g. sysctl/sysfs entry) to disable
> this behaviour, preventing a process in a namespace from receiving
> connections from the global vsock address space (i.e. /dev/vhost-vsock
> VMs)?
>
> I understand that by default maybe we should allow this behaviour in order
> to not break current applications, but in some cases the user may want to
> isolate sockets in a namespace also from being accessed by VMs running in
> the global vsock address space.
>
Adding this strict namespace mode makes sense to me, and I think the
sysctl/sysfs approach works well to minimize application changes. The
approach we were taking was to only allow /dev/vhost-vsock-netns (no
global /dev/vhost-vsock mixed in on the system), but I think adding the
explicit system-wide option improves the overall security posture of g2h
connections.
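Just to make the idea concrete, I'm picturing something along these lines
(the knob name and values are purely hypothetical at this point, for
illustration only):

    # Hypothetical knob, not an existing sysctl:
    #   0 = current behaviour: sockets in a netns can still reach VMs in the
    #       global /dev/vhost-vsock address space
    #   1 = strict: sockets in a netns only see /dev/vhost-vsock-netns
    #       devices opened in that same namespace
    ip netns exec ns1 sysctl -w net.vsock.strict_netns=1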
> Indeed in this series we have talked mostly about the host -> guest path (as
> the direction of the connection), but little about the guest -> host path,
> maybe we should explain it better in the cover/commit
> descriptions/documentation.
>
Sounds good!
Best,
Bobby
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-03-28 17:03 ` Stefano Garzarella
2025-03-28 20:13 ` Bobby Eshleman
@ 2025-04-01 19:05 ` Daniel P. Berrangé
2025-04-02 0:21 ` Bobby Eshleman
1 sibling, 1 reply; 42+ messages in thread
From: Daniel P. Berrangé @ 2025-04-01 19:05 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Bobby Eshleman, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Fri, Mar 28, 2025 at 06:03:19PM +0100, Stefano Garzarella wrote:
> CCing Daniel
>
> On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> > Picking up Stefano's v1 [1], this series adds netns support to
> > vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> > namespaces, defering that for future implementation and discussion.
> >
> > Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> > from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> > "scoped" vsock, accessible only to sockets in its namespace. If a global
> > vsock or scoped vsock share the same CID, the scoped vsock takes
> > precedence.
> >
> > If a socket in a namespace connects with a global vsock, the CID becomes
> > unavailable to any VMM in that namespace when creating new vsocks. If
> > disconnected, the CID becomes available again.
>
> I was talking about this feature with Daniel and he pointed out something
> interesting (Daniel please feel free to correct me):
>
> If we have a process in the host that does a listen(AF_VSOCK) in a
> namespace, can this receive connections from guests connected to
> /dev/vhost-vsock in any namespace?
>
> Should we provide something (e.g. sysctl/sysfs entry) to disable
> this behaviour, preventing a process in a namespace from receiving
> connections from the global vsock address space (i.e. /dev/vhost-vsock
> VMs)?
I think my concern goes a bit beyond that, to the general conceptual
idea of sharing the CID space between the global vsocks and namespace
vsocks. So I'm not sure a sysctl would be sufficient... details below.
> I understand that by default maybe we should allow this behaviour in order
> to not break current applications, but in some cases the user may want to
> isolate sockets in a namespace also from being accessed by VMs running in
> the global vsock address space.
>
> Indeed in this series we have talked mostly about the host -> guest path (as
> the direction of the connection), but little about the guest -> host path,
> maybe we should explain it better in the cover/commit
> descriptions/documentation.
> > Testing
> >
> > QEMU with /dev/vhost-vsock-netns support:
> > https://github.com/beshleman/qemu/tree/vsock-netns
> >
> > Test: Scoped vsocks isolated by namespace
> >
> > host# ip netns add ns1
> > host# ip netns add ns2
> > host# ip netns exec ns1 \
> > qemu-system-x86_64 \
> > -m 8G -smp 4 -cpu host -enable-kvm \
> > -serial mon:stdio \
> > -drive if=virtio,file=${IMAGE1} \
> > -device vhost-vsock-pci,netns=on,guest-cid=15
> > host# ip netns exec ns2 \
> > qemu-system-x86_64 \
> > -m 8G -smp 4 -cpu host -enable-kvm \
> > -serial mon:stdio \
> > -drive if=virtio,file=${IMAGE2} \
> > -device vhost-vsock-pci,netns=on,guest-cid=15
> >
> > host# socat - VSOCK-CONNECT:15:1234
> > 2025/03/10 17:09:40 socat[255741] E connect(5, AF=40 cid:15 port:1234, 16): No such device
> >
> > host# echo foobar1 | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > host# echo foobar2 | sudo ip netns exec ns2 socat - VSOCK-CONNECT:15:1234
> >
> > vm1# socat - VSOCK-LISTEN:1234
> > foobar1
> > vm2# socat - VSOCK-LISTEN:1234
> > foobar2
> >
> > Test: Global vsocks accessible to any namespace
> >
> > host# qemu-system-x86_64 \
> > -m 8G -smp 4 -cpu host -enable-kvm \
> > -serial mon:stdio \
> > -drive if=virtio,file=${IMAGE2} \
> > -device vhost-vsock-pci,guest-cid=15,netns=off
> >
> > host# echo foobar | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> >
> > vm# socat - VSOCK-LISTEN:1234
> > foobar
> >
> > Test: Connecting to global vsock makes CID unavailble to namespace
> >
> > host# qemu-system-x86_64 \
> > -m 8G -smp 4 -cpu host -enable-kvm \
> > -serial mon:stdio \
> > -drive if=virtio,file=${IMAGE2} \
> > -device vhost-vsock-pci,guest-cid=15,netns=off
> >
> > vm# socat - VSOCK-LISTEN:1234
> >
> > host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > host# ip netns exec ns1 \
> > qemu-system-x86_64 \
> > -m 8G -smp 4 -cpu host -enable-kvm \
> > -serial mon:stdio \
> > -drive if=virtio,file=${IMAGE1} \
> > -device vhost-vsock-pci,netns=on,guest-cid=15
> >
> > qemu-system-x86_64: -device vhost-vsock-pci,netns=on,guest-cid=15: vhost-vsock: unable to set guest cid: Address already in use
I find it conceptually quite unsettling that the VSOCK CID address
space for AF_VSOCK is shared between the host and the namespace.
That feels contrary to how namespaces are more commonly used for
deterministically isolating resources between the namespace and the
host.
Naively I would expect that in a namespace, all VSOCK CIDs are
free for use, without having to concern yourself with what CIDs
are in use in the host now, or in future.
What happens if we reverse the QEMU order above, to get the
following scenario:
# Launch VM1 inside the NS
host# ip netns exec ns1 \
qemu-system-x86_64 \
-m 8G -smp 4 -cpu host -enable-kvm \
-serial mon:stdio \
-drive if=virtio,file=${IMAGE1} \
-device vhost-vsock-pci,netns=on,guest-cid=15
# Launch VM2
host# qemu-system-x86_64 \
-m 8G -smp 4 -cpu host -enable-kvm \
-serial mon:stdio \
-drive if=virtio,file=${IMAGE2} \
-device vhost-vsock-pci,guest-cid=15,netns=off
vm1# socat - VSOCK-LISTEN:1234
vm2# socat - VSOCK-LISTEN:1234
host# socat - VSOCK-CONNECT:15:1234
=> Presume this connects to "VM2" running outside the NS
host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
=> Does this connect to "VM1" inside the NS, or "VM2"
outside the NS?
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
^ permalink raw reply	[flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-01 19:05 ` Daniel P. Berrangé
@ 2025-04-02 0:21 ` Bobby Eshleman
2025-04-02 8:13 ` Stefano Garzarella
0 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-04-02 0:21 UTC (permalink / raw)
To: Daniel P. Berrangé
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Tue, Apr 01, 2025 at 08:05:16PM +0100, Daniel P. Berrangé wrote:
> On Fri, Mar 28, 2025 at 06:03:19PM +0100, Stefano Garzarella wrote:
> > CCing Daniel
> >
> > On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> > > Picking up Stefano's v1 [1], this series adds netns support to
> > > vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> > > namespaces, defering that for future implementation and discussion.
> > >
> > > Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> > > from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> > > "scoped" vsock, accessible only to sockets in its namespace. If a global
> > > vsock or scoped vsock share the same CID, the scoped vsock takes
> > > precedence.
> > >
> > > If a socket in a namespace connects with a global vsock, the CID becomes
> > > unavailable to any VMM in that namespace when creating new vsocks. If
> > > disconnected, the CID becomes available again.
> >
> > I was talking about this feature with Daniel and he pointed out something
> > interesting (Daniel please feel free to correct me):
> >
> > If we have a process in the host that does a listen(AF_VSOCK) in a
> > namespace, can this receive connections from guests connected to
> > /dev/vhost-vsock in any namespace?
> >
> > Should we provide something (e.g. sysctl/sysfs entry) to disable
> > this behaviour, preventing a process in a namespace from receiving
> > connections from the global vsock address space (i.e. /dev/vhost-vsock
> > VMs)?
>
> I think my concern goes a bit beyond that, to the general conceptual
> idea of sharing the CID space between the global vsocks and namespace
> vsocks. So I'm not sure a sysctl would be sufficient...details later
> below..
>
> > I understand that by default maybe we should allow this behaviour in order
> > to not break current applications, but in some cases the user may want to
> > isolate sockets in a namespace also from being accessed by VMs running in
> > the global vsock address space.
> >
> > Indeed in this series we have talked mostly about the host -> guest path (as
> > the direction of the connection), but little about the guest -> host path,
> > maybe we should explain it better in the cover/commit
> > descriptions/documentation.
>
> > > Testing
> > >
> > > QEMU with /dev/vhost-vsock-netns support:
> > > https://github.com/beshleman/qemu/tree/vsock-netns
> > >
> > > Test: Scoped vsocks isolated by namespace
> > >
> > > host# ip netns add ns1
> > > host# ip netns add ns2
> > > host# ip netns exec ns1 \
> > > qemu-system-x86_64 \
> > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > -serial mon:stdio \
> > > -drive if=virtio,file=${IMAGE1} \
> > > -device vhost-vsock-pci,netns=on,guest-cid=15
> > > host# ip netns exec ns2 \
> > > qemu-system-x86_64 \
> > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > -serial mon:stdio \
> > > -drive if=virtio,file=${IMAGE2} \
> > > -device vhost-vsock-pci,netns=on,guest-cid=15
> > >
> > > host# socat - VSOCK-CONNECT:15:1234
> > > 2025/03/10 17:09:40 socat[255741] E connect(5, AF=40 cid:15 port:1234, 16): No such device
> > >
> > > host# echo foobar1 | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > > host# echo foobar2 | sudo ip netns exec ns2 socat - VSOCK-CONNECT:15:1234
> > >
> > > vm1# socat - VSOCK-LISTEN:1234
> > > foobar1
> > > vm2# socat - VSOCK-LISTEN:1234
> > > foobar2
> > >
> > > Test: Global vsocks accessible to any namespace
> > >
> > > host# qemu-system-x86_64 \
> > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > -serial mon:stdio \
> > > -drive if=virtio,file=${IMAGE2} \
> > > -device vhost-vsock-pci,guest-cid=15,netns=off
> > >
> > > host# echo foobar | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > >
> > > vm# socat - VSOCK-LISTEN:1234
> > > foobar
> > >
> > > Test: Connecting to global vsock makes CID unavailble to namespace
> > >
> > > host# qemu-system-x86_64 \
> > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > -serial mon:stdio \
> > > -drive if=virtio,file=${IMAGE2} \
> > > -device vhost-vsock-pci,guest-cid=15,netns=off
> > >
> > > vm# socat - VSOCK-LISTEN:1234
> > >
> > > host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > > host# ip netns exec ns1 \
> > > qemu-system-x86_64 \
> > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > -serial mon:stdio \
> > > -drive if=virtio,file=${IMAGE1} \
> > > -device vhost-vsock-pci,netns=on,guest-cid=15
> > >
> > > qemu-system-x86_64: -device vhost-vsock-pci,netns=on,guest-cid=15: vhost-vsock: unable to set guest cid: Address already in use
>
> I find it conceptually quite unsettling that the VSOCK CID address
> space for AF_VSOCK is shared between the host and the namespace.
> That feels contrary to how namespaces are more commonly used for
> deterministically isolating resources between the namespace and the
> host.
>
> Naively I would expect that in a namespace, all VSOCK CIDs are
> free for use, without having to concern yourself with what CIDs
> are in use in the host now, or in future.
>
True, that would be ideal. I think the definition of backwards
compatibility we've established includes the notion that any VM may
reach any namespace and any namespace may reach any VM. IIUC, it sounds
like you are suggesting this be revised to adhere more strictly to
namespace semantics?
I do like Stefano's suggestion to add a sysctl for a "strict" mode,
since it offers the best of both worlds and still errs on the
conservative side in protecting existing applications... but I agree,
the non-strict mode vsock would be unique WRT the usual concept of
namespaces.
> What happens if we reverse the QEMU order above, to get the
> following scenario
>
> # Launch VM1 inside the NS
> host# ip netns exec ns1 \
> qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE1} \
> -device vhost-vsock-pci,netns=on,guest-cid=15
> # Launch VM2
> host# qemu-system-x86_64 \
> -m 8G -smp 4 -cpu host -enable-kvm \
> -serial mon:stdio \
> -drive if=virtio,file=${IMAGE2} \
> -device vhost-vsock-pci,guest-cid=15,netns=off
>
> vm1# socat - VSOCK-LISTEN:1234
> vm2# socat - VSOCK-LISTEN:1234
>
> host# socat - VSOCK-CONNECT:15:1234
> => Presume this connects to "VM2" running outside the NS
>
> host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
>
> => Does this connect to "VM1" inside the NS, or "VM2"
> outside the NS ?
>
VM1 inside the NS. The current logic says that whenever two CIDs
collide (local vs global), the one in the local namespace is always
selected, irrespective of creation order.
With the sysctl option added, it would *never* connect to the global
one, even if there were no local match but there was a global one.
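Concretely, a minimal sketch of that precedence (the helper name and
the representation of the global namespace below are illustrative
assumptions, not the literal series code):

    /* Sketch: prefer a socket bound in the caller's netns, then fall
     * back to the global vsock address space.
     */
    static struct sock *lookup_bound_with_fallback(struct sockaddr_vm *addr,
                                                   struct net *net)
    {
            struct sock *sk;

            /* 1) a vsock bound in this (non-global) namespace wins ... */
            sk = lookup_bound_in_ns(addr, net);
            if (sk)
                    return sk;

            /* 2) ... otherwise fall back to the global namespace */
            return lookup_bound_in_ns(addr, NULL /* "global" */);
    }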
>
>
> With regards,
> Daniel
Thanks for the review!
Best,
Bobby
^ permalink raw reply	[flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-02 0:21 ` Bobby Eshleman
@ 2025-04-02 8:13 ` Stefano Garzarella
2025-04-02 9:21 ` Daniel P. Berrangé
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-04-02 8:13 UTC (permalink / raw)
To: Bobby Eshleman, Daniel P. Berrangé
Cc: Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, 2 Apr 2025 at 02:21, Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
>
> On Tue, Apr 01, 2025 at 08:05:16PM +0100, Daniel P. Berrangé wrote:
> > On Fri, Mar 28, 2025 at 06:03:19PM +0100, Stefano Garzarella wrote:
> > > CCing Daniel
> > >
> > > On Wed, Mar 12, 2025 at 01:59:34PM -0700, Bobby Eshleman wrote:
> > > > Picking up Stefano's v1 [1], this series adds netns support to
> > > > vhost-vsock. Unlike v1, this series does not address guest-to-host (g2h)
> > > > namespaces, defering that for future implementation and discussion.
> > > >
> > > > Any vsock created with /dev/vhost-vsock is a global vsock, accessible
> > > > from any namespace. Any vsock created with /dev/vhost-vsock-netns is a
> > > > "scoped" vsock, accessible only to sockets in its namespace. If a global
> > > > vsock or scoped vsock share the same CID, the scoped vsock takes
> > > > precedence.
> > > >
> > > > If a socket in a namespace connects with a global vsock, the CID becomes
> > > > unavailable to any VMM in that namespace when creating new vsocks. If
> > > > disconnected, the CID becomes available again.
> > >
> > > I was talking about this feature with Daniel and he pointed out something
> > > interesting (Daniel please feel free to correct me):
> > >
> > > If we have a process in the host that does a listen(AF_VSOCK) in a
> > > namespace, can this receive connections from guests connected to
> > > /dev/vhost-vsock in any namespace?
> > >
> > > Should we provide something (e.g. sysctl/sysfs entry) to disable
> > > this behaviour, preventing a process in a namespace from receiving
> > > connections from the global vsock address space (i.e. /dev/vhost-vsock
> > > VMs)?
> >
> > I think my concern goes a bit beyond that, to the general conceptual
> > idea of sharing the CID space between the global vsocks and namespace
> > vsocks. So I'm not sure a sysctl would be sufficient...details later
> > below..
> >
> > > I understand that by default maybe we should allow this behaviour in order
> > > to not break current applications, but in some cases the user may want to
> > > isolate sockets in a namespace also from being accessed by VMs running in
> > > the global vsock address space.
> > >
> > > Indeed in this series we have talked mostly about the host -> guest path (as
> > > the direction of the connection), but little about the guest -> host path,
> > > maybe we should explain it better in the cover/commit
> > > descriptions/documentation.
> >
> > > > Testing
> > > >
> > > > QEMU with /dev/vhost-vsock-netns support:
> > > > https://github.com/beshleman/qemu/tree/vsock-netns
> > > >
> > > > Test: Scoped vsocks isolated by namespace
> > > >
> > > > host# ip netns add ns1
> > > > host# ip netns add ns2
> > > > host# ip netns exec ns1 \
> > > > qemu-system-x86_64 \
> > > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > > -serial mon:stdio \
> > > > -drive if=virtio,file=${IMAGE1} \
> > > > -device vhost-vsock-pci,netns=on,guest-cid=15
> > > > host# ip netns exec ns2 \
> > > > qemu-system-x86_64 \
> > > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > > -serial mon:stdio \
> > > > -drive if=virtio,file=${IMAGE2} \
> > > > -device vhost-vsock-pci,netns=on,guest-cid=15
> > > >
> > > > host# socat - VSOCK-CONNECT:15:1234
> > > > 2025/03/10 17:09:40 socat[255741] E connect(5, AF=40 cid:15 port:1234, 16): No such device
> > > >
> > > > host# echo foobar1 | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > > > host# echo foobar2 | sudo ip netns exec ns2 socat - VSOCK-CONNECT:15:1234
> > > >
> > > > vm1# socat - VSOCK-LISTEN:1234
> > > > foobar1
> > > > vm2# socat - VSOCK-LISTEN:1234
> > > > foobar2
> > > >
> > > > Test: Global vsocks accessible to any namespace
> > > >
> > > > host# qemu-system-x86_64 \
> > > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > > -serial mon:stdio \
> > > > -drive if=virtio,file=${IMAGE2} \
> > > > -device vhost-vsock-pci,guest-cid=15,netns=off
> > > >
> > > > host# echo foobar | sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > > >
> > > > vm# socat - VSOCK-LISTEN:1234
> > > > foobar
> > > >
> > > > Test: Connecting to global vsock makes CID unavailble to namespace
> > > >
> > > > host# qemu-system-x86_64 \
> > > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > > -serial mon:stdio \
> > > > -drive if=virtio,file=${IMAGE2} \
> > > > -device vhost-vsock-pci,guest-cid=15,netns=off
> > > >
> > > > vm# socat - VSOCK-LISTEN:1234
> > > >
> > > > host# sudo ip netns exec ns1 socat - VSOCK-CONNECT:15:1234
> > > > host# ip netns exec ns1 \
> > > > qemu-system-x86_64 \
> > > > -m 8G -smp 4 -cpu host -enable-kvm \
> > > > -serial mon:stdio \
> > > > -drive if=virtio,file=${IMAGE1} \
> > > > -device vhost-vsock-pci,netns=on,guest-cid=15
> > > >
> > > > qemu-system-x86_64: -device vhost-vsock-pci,netns=on,guest-cid=15: vhost-vsock: unable to set guest cid: Address already in use
> >
> > I find it conceptually quite unsettling that the VSOCK CID address
> > space for AF_VSOCK is shared between the host and the namespace.
> > That feels contrary to how namespaces are more commonly used for
> > deterministically isolating resources between the namespace and the
> > host.
> >
> > Naively I would expect that in a namespace, all VSOCK CIDs are
> > free for use, without having to concern yourself with what CIDs
> > are in use in the host now, or in future.
> >
>
> True, that would be ideal. I think the definition of backwards
> compatibility we've established includes the notion that any VM may
> reach any namespace and any namespace may reach any VM. IIUC, it
> sounds
> like you are suggesting this be revised to more strictly adhere to
> namespace semantics?
>
> I do like Stefano's suggestion to add a sysctl for a "strict" mode,
> Since it offers the best of both worlds, and still tends conservative in
> protecting existing applications... but I agree, the non-strict mode
> vsock would be unique WRT the usual concept of namespaces.
Maybe we could do the opposite and enable strict mode by default (I
think it was similar to what I had tried to do with the kernel module
in v1; I was young, I know xD), and provide a way to disable it for
those use cases where the user wants backward compatibility, while
paying the cost of less isolation.
I was thinking of two options (not sure if the second one can be done):
1. provide a global sysfs/sysctl that disables strict mode, but this
then applies to all namespaces
2. provide something that allows disabling strict mode by namespace.
Maybe when it is created there are options, or something that can be
set later.
2 would be ideal, but that might be too much, so 1 might be enough. In
any case, 2 could also be a next step.
WDYT?
Thanks,
Stefano
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-02 8:13 ` Stefano Garzarella
@ 2025-04-02 9:21 ` Daniel P. Berrangé
2025-04-02 22:18 ` Bobby Eshleman
2025-04-03 9:01 ` Stefano Garzarella
0 siblings, 2 replies; 42+ messages in thread
From: Daniel P. Berrangé @ 2025-04-02 9:21 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Bobby Eshleman, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Apr 02, 2025 at 10:13:43AM +0200, Stefano Garzarella wrote:
> On Wed, 2 Apr 2025 at 02:21, Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
> >
> > I do like Stefano's suggestion to add a sysctl for a "strict" mode,
> > Since it offers the best of both worlds, and still tends conservative in
> > protecting existing applications... but I agree, the non-strict mode
> > vsock would be unique WRT the usual concept of namespaces.
>
> Maybe we could do the opposite, enable strict mode by default (I think
> it was similar to what I had tried to do with the kernel module in v1, I
> was young I know xD)
> And provide a way to disable it for those use cases where the user wants
> backward compatibility, while paying the cost of less isolation.
I think backwards compatibility has to be the default behaviour,
otherwise the change has too high a risk of breaking existing
deployments that are already using netns and relying on VSOCK being
global. Breakage has to be opt-in.
> I was thinking two options (not sure if the second one can be done):
>
> 1. provide a global sysfs/sysctl that disables strict mode, but this
> then applies to all namespaces
>
> 2. provide something that allows disabling strict mode by namespace.
> Maybe when it is created there are options, or something that can be
> set later.
>
> 2 would be ideal, but that might be too much, so 1 might be enough. In
> any case, 2 could also be a next step.
>
> WDYT?
It occurred to me that the problem we face with the CID space usage is
somewhat similar to the UID/GID space usage for user namespaces.
In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
At the risk of being overkill, is it worth trying a similar kind of
approach for the vsock CID space?
A simple variant would be a /proc/net/vsock_cid_outside specifying a set
of CIDs which are exclusively referencing /dev/vhost-vsock associations
created outside the namespace. Anything not listed would be exclusively
referencing associations created inside the namespace.
A more complex variant would be to allow a full remapping of CIDs as is
done with userns, via a /proc/net/vsock_cid_map with the same three
parameters, so that the CID=15 association outside the namespace could
be remapped to CID=9015 inside the namespace, allowing the inside
namespace to define its own association for CID=15 without clashing.
IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
associations created outside namespace, while unmapped CIDs would be
exclusively referencing /dev/vhost-vsock associations inside the
namespace.
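To make the remapping idea concrete, a rough sketch of what one entry
of such a map and the inside-to-outside translation could look like;
this is purely illustrative, no vsock_cid_map file or struct like this
exists in the series:

    struct vsock_cid_map_entry {
            u32 inside_first;   /* first CID as seen inside the netns */
            u32 outside_first;  /* first CID it refers to outside */
            u32 count;          /* length of the range */
    };

    /* Translate an inside-namespace CID to the outside CID it refers
     * to. Returns false if the CID is unmapped, i.e. it names an
     * association created inside the namespace itself.
     */
    static bool vsock_cid_to_outside(const struct vsock_cid_map_entry *map,
                                     size_t n, u32 cid, u32 *outside)
    {
            size_t i;

            for (i = 0; i < n; i++) {
                    if (cid >= map[i].inside_first &&
                        cid - map[i].inside_first < map[i].count) {
                            *outside = map[i].outside_first +
                                       (cid - map[i].inside_first);
                            return true;
                    }
            }
            return false;
    }

    /* e.g. a single entry { .inside_first = 9015, .outside_first = 15,
     * .count = 1 } lets inside CID 9015 reach the outside CID=15 guest.
     */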
A likely benefit of relying on a kernel defined mapping/partition of
the CID space is that apps like QEMU don't need changing, as there's
no need to invent a new /dev/vhost-vsock-netns device node.
Both approaches give the desirable security protection whereby the
inside namespace can be prevented from accessing certain CIDs that
were associated outside the namespace.
Some rule would need to be defined for updating the
/proc/net/vsock_cid_map file, as it is the security control mechanism.
If it is write-once, then once the container mgmt app has initialized
it, nothing later could change it.
A key question is: do we need the "first come, first served" behaviour
for CIDs, where a CID can be arbitrarily used by the outside or inside
namespace according to whatever tries to associate it first?
IMHO those semantics lead to unpredictable behaviour for apps because
what happens depends on ordering of app launches inside & outside the
namespace, but they do sort of allow for VSOCK namespace behaviour to
be 'zero conf' out of the box.
A mapping that strictly partitions CIDs to either outside or inside
namespace usage, but never both, gives well-defined behaviour, at the
cost of needing to set up an initial mapping/partition.
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-02 9:21 ` Daniel P. Berrangé
@ 2025-04-02 22:18 ` Bobby Eshleman
2025-04-02 22:28 ` Bobby Eshleman
2025-04-04 13:05 ` Daniel P. Berrangé
2025-04-03 9:01 ` Stefano Garzarella
1 sibling, 2 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-04-02 22:18 UTC (permalink / raw)
To: Daniel P. Berrangé
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
> On Wed, Apr 02, 2025 at 10:13:43AM +0200, Stefano Garzarella wrote:
> > On Wed, 2 Apr 2025 at 02:21, Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
> > >
> > > I do like Stefano's suggestion to add a sysctl for a "strict" mode,
> > > Since it offers the best of both worlds, and still tends conservative in
> > > protecting existing applications... but I agree, the non-strict mode
> > > vsock would be unique WRT the usual concept of namespaces.
> >
> > Maybe we could do the opposite, enable strict mode by default (I think
> > it was similar to what I had tried to do with the kernel module in v1, I
> > was young I know xD)
> > And provide a way to disable it for those use cases where the user wants
> > backward compatibility, while paying the cost of less isolation.
>
> I think backwards compatible has to be the default behaviour, otherwise
> the change has too high risk of breaking existing deployments that are
> already using netns and relying on VSOCK being global. Breakage has to
> be opt in.
>
> > I was thinking two options (not sure if the second one can be done):
> >
> > 1. provide a global sysfs/sysctl that disables strict mode, but this
> > then applies to all namespaces
> >
> > 2. provide something that allows disabling strict mode by namespace.
> > Maybe when it is created there are options, or something that can be
> > set later.
> >
> > 2 would be ideal, but that might be too much, so 1 might be enough. In
> > any case, 2 could also be a next step.
> >
> > WDYT?
>
> It occured to me that the problem we face with the CID space usage is
> somewhat similar to the UID/GID space usage for user namespaces.
>
> In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
> allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
>
> At the risk of being overkill, is it worth trying a similar kind of
> approach for the vsock CID space ?
>
> A simple variant would be a /proc/net/vsock_cid_outside specifying a set
> of CIDs which are exclusively referencing /dev/vhost-vsock associations
> created outside the namespace. Anything not listed would be exclusively
> referencing associations created inside the namespace.
>
> A more complex variant would be to allow a full remapping of CIDs as is
> done with userns, via a /proc/net/vsock_cid_map, which the same three
> parameters, so that CID=15 association outside the namespace could be
> remapped to CID=9015 inside the namespace, allow the inside namespace
> to define its out association for CID=15 without clashing.
>
> IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
> associations created outside namespace, while unmapped CIDs would be
> exclusively referencing /dev/vhost-vsock associations inside the
> namespace.
>
> A likely benefit of relying on a kernel defined mapping/partition of
> the CID space is that apps like QEMU don't need changing, as there's
> no need to invent a new /dev/vhost-vsock-netns device node.
>
> Both approaches give the desirable security protection whereby the
> inside namespace can be prevented from accessing certain CIDs that
> were associated outside the namespace.
>
> Some rule would need to be defined for updating the /proc/net/vsock_cid_map
> file as it is the security control mechanism. If it is write-once then
> if the container mgmt app initializes it, nothing later could change
> it.
>
> A key question is do we need the "first come, first served" behaviour
> for CIDs where a CID can be arbitrarily used by outside or inside namespace
> according to whatever tries to associate a CID first ?
I think with /proc/net/vsock_cid_outside, instead of disallowing the
CID from being used, this could be solved by disallowing remapping of a
CID while it is in use?
The thing I like about this is that users can check
/proc/net/vsock_cid_outside to figure out what might be going on,
instead of trying to check lsof or ps to figure out if the VMM processes
have used /dev/vhost-vsock vs /dev/vhost-vsock-netns.
Just to check I am following... I suppose we would have a few typical
configurations for /proc/net/vsock_cid_outside. Following uid_map file
format of:
"<local cid start> <global cid start> <range size>"
1. Identity mapping, current namespace CID is global CID (default
setting for new namespaces):
# empty file
OR
0 0 4294967295
2. Complete isolation from global space (initialized, but no mappings):
0 0 0
3. Mapping in ranges of global CIDs
For example, global CID space starts at 7000, up to 32-bit max:
7000 0 4294960295
Or for multiple mappings (0-100 map to 7000-7100, 1000-1100 map to
8000-8100) :
7000 0 100
8000 1000 100
One thing I don't love is that option 3 does not seem to address a
known use case. It doesn't necessarily hurt to have, but it would add
complexity to CID handling that might never get used?
Since options 1/2 could also be represented by a boolean (yes/no
"current ns shares CID with global"), I wonder if we could either A)
only support the first two options at first, or B) add just
/proc/net/vsock_ns_mode at first, which supports only "global" and
"local", and later add a "mapped" mode plus /proc/net/vsock_cid_outside
or the full mapping if the need arises?
This could also be how we support Option 2 from Stefano's last email of
supporting per-namespace opt-in/opt-out.
Any thoughts on this?
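For what it's worth, a rough sketch of what option B could look like on
the kernel side; everything below, including the names, is hypothetical:

    /* Hypothetical per-netns vsock state backing a
     * /proc/net/vsock_ns_mode file; only "global" and "local" to start.
     */
    enum vsock_ns_mode {
            VSOCK_NS_MODE_GLOBAL,   /* share the global CID space (default) */
            VSOCK_NS_MODE_LOCAL,    /* CIDs fully private to this netns */
    };

    struct vsock_pernet {
            enum vsock_ns_mode mode;
    };

    /* A "local" namespace would simply skip the global fallback in the
     * socket lookup and refuse to claim CIDs owned outside of it.
     */
    static bool vsock_ns_allows_global(const struct vsock_pernet *pn)
    {
            return pn->mode == VSOCK_NS_MODE_GLOBAL;
    }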
>
> IMHO those semantics lead to unpredictable behaviour for apps because
> what happens depends on ordering of app launches inside & outside the
> namespace, but they do sort of allow for VSOCK namespace behaviour to
> be 'zero conf' out of the box.
>
> A mapping that strictly partitions CIDs to either outside or inside
> namespace usage, but never both, gives well defined behaviour, at the
> cost of needing to setup an initial mapping/partition.
>
Agreed, I do like the plainness of reasoning through it.
Thanks!
Bobby
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-02 22:18 ` Bobby Eshleman
@ 2025-04-02 22:28 ` Bobby Eshleman
2025-04-03 9:33 ` Stefano Garzarella
2025-04-04 13:05 ` Daniel P. Berrangé
1 sibling, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-04-02 22:28 UTC (permalink / raw)
To: Daniel P. Berrangé
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Wed, Apr 02, 2025 at 03:18:13PM -0700, Bobby Eshleman wrote:
> On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
> > On Wed, Apr 02, 2025 at 10:13:43AM +0200, Stefano Garzarella wrote:
> > > On Wed, 2 Apr 2025 at 02:21, Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
> > > >
> > > > I do like Stefano's suggestion to add a sysctl for a "strict" mode,
> > > > Since it offers the best of both worlds, and still tends conservative in
> > > > protecting existing applications... but I agree, the non-strict mode
> > > > vsock would be unique WRT the usual concept of namespaces.
> > >
> > > Maybe we could do the opposite, enable strict mode by default (I think
> > > it was similar to what I had tried to do with the kernel module in v1, I
> > > was young I know xD)
> > > And provide a way to disable it for those use cases where the user wants
> > > backward compatibility, while paying the cost of less isolation.
> >
> > I think backwards compatible has to be the default behaviour, otherwise
> > the change has too high risk of breaking existing deployments that are
> > already using netns and relying on VSOCK being global. Breakage has to
> > be opt in.
> >
> > > I was thinking two options (not sure if the second one can be done):
> > >
> > > 1. provide a global sysfs/sysctl that disables strict mode, but this
> > > then applies to all namespaces
> > >
> > > 2. provide something that allows disabling strict mode by namespace.
> > > Maybe when it is created there are options, or something that can be
> > > set later.
> > >
> > > 2 would be ideal, but that might be too much, so 1 might be enough. In
> > > any case, 2 could also be a next step.
> > >
> > > WDYT?
> >
> > It occured to me that the problem we face with the CID space usage is
> > somewhat similar to the UID/GID space usage for user namespaces.
> >
> > In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
> > allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
> >
> > At the risk of being overkill, is it worth trying a similar kind of
> > approach for the vsock CID space ?
> >
> > A simple variant would be a /proc/net/vsock_cid_outside specifying a set
> > of CIDs which are exclusively referencing /dev/vhost-vsock associations
> > created outside the namespace. Anything not listed would be exclusively
> > referencing associations created inside the namespace.
> >
> > A more complex variant would be to allow a full remapping of CIDs as is
> > done with userns, via a /proc/net/vsock_cid_map, which the same three
> > parameters, so that CID=15 association outside the namespace could be
> > remapped to CID=9015 inside the namespace, allow the inside namespace
> > to define its out association for CID=15 without clashing.
> >
> > IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
> > associations created outside namespace, while unmapped CIDs would be
> > exclusively referencing /dev/vhost-vsock associations inside the
> > namespace.
> >
> > A likely benefit of relying on a kernel defined mapping/partition of
> > the CID space is that apps like QEMU don't need changing, as there's
> > no need to invent a new /dev/vhost-vsock-netns device node.
> >
> > Both approaches give the desirable security protection whereby the
> > inside namespace can be prevented from accessing certain CIDs that
> > were associated outside the namespace.
> >
> > Some rule would need to be defined for updating the /proc/net/vsock_cid_map
> > file as it is the security control mechanism. If it is write-once then
> > if the container mgmt app initializes it, nothing later could change
> > it.
> >
> > A key question is do we need the "first come, first served" behaviour
> > for CIDs where a CID can be arbitrarily used by outside or inside namespace
> > according to whatever tries to associate a CID first ?
>
> I think with /proc/net/vsock_cid_outside, instead of disallowing the CID
> from being used, this could be solved by disallowing remapping the CID
> while in use?
>
> The thing I like about this is that users can check
> /proc/net/vsock_cid_outside to figure out what might be going on,
> instead of trying to check lsof or ps to figure out if the VMM processes
> have used /dev/vhost-vsock vs /dev/vhost-vsock-netns.
>
> Just to check I am following... I suppose we would have a few typical
> configurations for /proc/net/vsock_cid_outside. Following uid_map file
> format of:
> "<local cid start> <global cid start> <range size>"
>
> 1. Identity mapping, current namespace CID is global CID (default
> setting for new namespaces):
>
> # empty file
>
> OR
>
> 0 0 4294967295
>
> 2. Complete isolation from global space (initialized, but no mappings):
>
> 0 0 0
>
> 3. Mapping in ranges of global CIDs
>
> For example, global CID space starts at 7000, up to 32-bit max:
>
> 7000 0 4294960295
>
> Or for multiple mappings (0-100 map to 7000-7100, 1000-1100 map to
> 8000-8100) :
>
> 7000 0 100
> 8000 1000 100
>
>
> One thing I don't love is that option 3 seems to not be addressing a
> known use case. It doesn't necessarily hurt to have, but it will add
> complexity to CID handling that might never get used?
>
> Since options 1/2 could also be represented by a boolean (yes/no
> "current ns shares CID with global"), I wonder if we could either A)
> only support the first two options at first, or B) add just
> /proc/net/vsock_ns_mode at first, which supports only "global" and
> "local", and later add a "mapped" mode plus /proc/net/vsock_cid_outside
> or the full mapping if the need arises?
>
> This could also be how we support Option 2 from Stefano's last email of
> supporting per-namespace opt-in/opt-out.
>
> Any thoughts on this?
>
Stefano,
Would only supporting 1/2 still support the Kata use case?
Thanks,
Bobby
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-02 22:28 ` Bobby Eshleman
@ 2025-04-03 9:33 ` Stefano Garzarella
2025-04-03 19:42 ` Bobby Eshleman
0 siblings, 1 reply; 42+ messages in thread
From: Stefano Garzarella @ 2025-04-03 9:33 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Daniel P. Berrangé, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Wed, Apr 02, 2025 at 03:28:19PM -0700, Bobby Eshleman wrote:
>On Wed, Apr 02, 2025 at 03:18:13PM -0700, Bobby Eshleman wrote:
>> On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
>> > On Wed, Apr 02, 2025 at 10:13:43AM +0200, Stefano Garzarella wrote:
>> > > On Wed, 2 Apr 2025 at 02:21, Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
>> > > >
>> > > > I do like Stefano's suggestion to add a sysctl for a "strict" mode,
>> > > > Since it offers the best of both worlds, and still tends conservative in
>> > > > protecting existing applications... but I agree, the non-strict mode
>> > > > vsock would be unique WRT the usual concept of namespaces.
>> > >
>> > > Maybe we could do the opposite, enable strict mode by default (I think
>> > > it was similar to what I had tried to do with the kernel module in v1, I
>> > > was young I know xD)
>> > > And provide a way to disable it for those use cases where the user wants
>> > > backward compatibility, while paying the cost of less isolation.
>> >
>> > I think backwards compatible has to be the default behaviour, otherwise
>> > the change has too high risk of breaking existing deployments that are
>> > already using netns and relying on VSOCK being global. Breakage has to
>> > be opt in.
>> >
>> > > I was thinking two options (not sure if the second one can be done):
>> > >
>> > > 1. provide a global sysfs/sysctl that disables strict mode, but this
>> > > then applies to all namespaces
>> > >
>> > > 2. provide something that allows disabling strict mode by namespace.
>> > > Maybe when it is created there are options, or something that can be
>> > > set later.
>> > >
>> > > 2 would be ideal, but that might be too much, so 1 might be enough. In
>> > > any case, 2 could also be a next step.
>> > >
>> > > WDYT?
>> >
>> > It occured to me that the problem we face with the CID space usage is
>> > somewhat similar to the UID/GID space usage for user namespaces.
>> >
>> > In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
>> > allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
>> >
>> > At the risk of being overkill, is it worth trying a similar kind of
>> > approach for the vsock CID space ?
>> >
>> > A simple variant would be a /proc/net/vsock_cid_outside specifying a set
>> > of CIDs which are exclusively referencing /dev/vhost-vsock associations
>> > created outside the namespace. Anything not listed would be exclusively
>> > referencing associations created inside the namespace.
>> >
>> > A more complex variant would be to allow a full remapping of CIDs as is
>> > done with userns, via a /proc/net/vsock_cid_map, which the same three
>> > parameters, so that CID=15 association outside the namespace could be
>> > remapped to CID=9015 inside the namespace, allow the inside namespace
>> > to define its out association for CID=15 without clashing.
>> >
>> > IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
>> > associations created outside namespace, while unmapped CIDs would be
>> > exclusively referencing /dev/vhost-vsock associations inside the
>> > namespace.
>> >
>> > A likely benefit of relying on a kernel defined mapping/partition of
>> > the CID space is that apps like QEMU don't need changing, as there's
>> > no need to invent a new /dev/vhost-vsock-netns device node.
>> >
>> > Both approaches give the desirable security protection whereby the
>> > inside namespace can be prevented from accessing certain CIDs that
>> > were associated outside the namespace.
>> >
>> > Some rule would need to be defined for updating the /proc/net/vsock_cid_map
>> > file as it is the security control mechanism. If it is write-once then
>> > if the container mgmt app initializes it, nothing later could change
>> > it.
>> >
>> > A key question is do we need the "first come, first served" behaviour
>> > for CIDs where a CID can be arbitrarily used by outside or inside namespace
>> > according to whatever tries to associate a CID first ?
>>
>> I think with /proc/net/vsock_cid_outside, instead of disallowing the CID
>> from being used, this could be solved by disallowing remapping the CID
>> while in use?
>>
>> The thing I like about this is that users can check
>> /proc/net/vsock_cid_outside to figure out what might be going on,
>> instead of trying to check lsof or ps to figure out if the VMM processes
>> have used /dev/vhost-vsock vs /dev/vhost-vsock-netns.
Yes, although the user in theory should not care about this information,
right?
I mean I don't even know if it makes sense to expose the contents of
/proc/net/vsock_cid_outside in the namespace.
>>
>> Just to check I am following... I suppose we would have a few typical
>> configurations for /proc/net/vsock_cid_outside. Following uid_map file
>> format of:
>> "<local cid start> <global cid start> <range size>"
This seems to relate more to /proc/net/vsock_cid_map; for
/proc/net/vsock_cid_outside I think two parameters are enough
(CID, range), right?
>>
>> 1. Identity mapping, current namespace CID is global CID (default
>> setting for new namespaces):
>>
>> # empty file
>>
>> OR
>>
>> 0 0 4294967295
>>
>> 2. Complete isolation from global space (initialized, but no mappings):
>>
>> 0 0 0
>>
>> 3. Mapping in ranges of global CIDs
>>
>> For example, global CID space starts at 7000, up to 32-bit max:
>>
>> 7000 0 4294960295
>>
>> Or for multiple mappings (0-100 map to 7000-7100, 1000-1100 map to
>> 8000-8100) :
>>
>> 7000 0 100
>> 8000 1000 100
>>
>>
>> One thing I don't love is that option 3 seems to not be addressing a
>> known use case. It doesn't necessarily hurt to have, but it will add
>> complexity to CID handling that might never get used?
Yes, as I also mentioned in the previous email, we could also do a
step-by-step thing.
IMHO we can define /proc/net/vsock_cid_map (with the structure you just
defined), but for now only support 1-1 mapping (with the ranges of
course, I mean the first two parameters should always be the same) and
then add option 3 in the future.
>>
>> Since options 1/2 could also be represented by a boolean (yes/no
>> "current ns shares CID with global"), I wonder if we could either A)
>> only support the first two options at first, or B) add just
>> /proc/net/vsock_ns_mode at first, which supports only "global" and
>> "local", and later add a "mapped" mode plus /proc/net/vsock_cid_outside
>> or the full mapping if the need arises?
I think option A is the same as I meant above :-)
>>
>> This could also be how we support Option 2 from Stefano's last email of
>> supporting per-namespace opt-in/opt-out.
Hmm, how can we do it by namespace? Isn't that global?
>>
>> Any thoughts on this?
>>
>
>Stefano,
>
>Would only supporting 1/2 still support the Kata use case?
I think so, actually I was thinking something similar in the message I
just sent.
By default (if the file is empty), nothing should change, so that's fine
IMO. As Paolo suggested, we absolutely have to have tests to verify
these things.
Thanks,
Stefano
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-03 9:33 ` Stefano Garzarella
@ 2025-04-03 19:42 ` Bobby Eshleman
0 siblings, 0 replies; 42+ messages in thread
From: Bobby Eshleman @ 2025-04-03 19:42 UTC (permalink / raw)
To: Stefano Garzarella
Cc: Daniel P. Berrangé, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Thu, Apr 03, 2025 at 11:33:14AM +0200, Stefano Garzarella wrote:
> On Wed, Apr 02, 2025 at 03:28:19PM -0700, Bobby Eshleman wrote:
> > On Wed, Apr 02, 2025 at 03:18:13PM -0700, Bobby Eshleman wrote:
> > > On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
> > > > On Wed, Apr 02, 2025 at 10:13:43AM +0200, Stefano Garzarella wrote:
> > > > > On Wed, 2 Apr 2025 at 02:21, Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
> > > > > >
> > > > > > I do like Stefano's suggestion to add a sysctl for a "strict" mode,
> > > > > > Since it offers the best of both worlds, and still tends conservative in
> > > > > > protecting existing applications... but I agree, the non-strict mode
> > > > > > vsock would be unique WRT the usual concept of namespaces.
> > > > >
> > > > > Maybe we could do the opposite, enable strict mode by default (I think
> > > > > it was similar to what I had tried to do with the kernel module in v1, I
> > > > > was young I know xD)
> > > > > And provide a way to disable it for those use cases where the user wants
> > > > > backward compatibility, while paying the cost of less isolation.
> > > >
> > > > I think backwards compatible has to be the default behaviour, otherwise
> > > > the change has too high risk of breaking existing deployments that are
> > > > already using netns and relying on VSOCK being global. Breakage has to
> > > > be opt in.
> > > >
> > > > > I was thinking two options (not sure if the second one can be done):
> > > > >
> > > > > 1. provide a global sysfs/sysctl that disables strict mode, but this
> > > > > then applies to all namespaces
> > > > >
> > > > > 2. provide something that allows disabling strict mode by namespace.
> > > > > Maybe when it is created there are options, or something that can be
> > > > > set later.
> > > > >
> > > > > 2 would be ideal, but that might be too much, so 1 might be enough. In
> > > > > any case, 2 could also be a next step.
> > > > >
> > > > > WDYT?
> > > >
> > > > It occured to me that the problem we face with the CID space usage is
> > > > somewhat similar to the UID/GID space usage for user namespaces.
> > > >
> > > > In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
> > > > allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
> > > >
> > > > At the risk of being overkill, is it worth trying a similar kind of
> > > > approach for the vsock CID space ?
> > > >
> > > > A simple variant would be a /proc/net/vsock_cid_outside specifying a set
> > > > of CIDs which are exclusively referencing /dev/vhost-vsock associations
> > > > created outside the namespace. Anything not listed would be exclusively
> > > > referencing associations created inside the namespace.
> > > >
> > > > A more complex variant would be to allow a full remapping of CIDs as is
> > > > done with userns, via a /proc/net/vsock_cid_map, which the same three
> > > > parameters, so that CID=15 association outside the namespace could be
> > > > remapped to CID=9015 inside the namespace, allow the inside namespace
> > > > to define its out association for CID=15 without clashing.
> > > >
> > > > IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
> > > > associations created outside namespace, while unmapped CIDs would be
> > > > exclusively referencing /dev/vhost-vsock associations inside the
> > > > namespace.
> > > >
> > > > A likely benefit of relying on a kernel defined mapping/partition of
> > > > the CID space is that apps like QEMU don't need changing, as there's
> > > > no need to invent a new /dev/vhost-vsock-netns device node.
> > > >
> > > > Both approaches give the desirable security protection whereby the
> > > > inside namespace can be prevented from accessing certain CIDs that
> > > > were associated outside the namespace.
> > > >
> > > > Some rule would need to be defined for updating the /proc/net/vsock_cid_map
> > > > file as it is the security control mechanism. If it is write-once then
> > > > if the container mgmt app initializes it, nothing later could change
> > > > it.
> > > >
> > > > A key question is do we need the "first come, first served" behaviour
> > > > for CIDs where a CID can be arbitrarily used by outside or inside namespace
> > > > according to whatever tries to associate a CID first ?
> > >
> > > I think with /proc/net/vsock_cid_outside, instead of disallowing the CID
> > > from being used, this could be solved by disallowing remapping the CID
> > > while in use?
> > >
> > > The thing I like about this is that users can check
> > > /proc/net/vsock_cid_outside to figure out what might be going on,
> > > instead of trying to check lsof or ps to figure out if the VMM processes
> > > have used /dev/vhost-vsock vs /dev/vhost-vsock-netns.
>
> Yes, although the user in theory should not care about this information,
> right?
> I mean I don't even know if it makes sense to expose the contents of
> /proc/net/vsock_cid_outside in the namespace.
>
> > >
> > > Just to check I am following... I suppose we would have a few typical
> > > configurations for /proc/net/vsock_cid_outside. Following uid_map file
> > > format of:
> > > "<local cid start> <global cid start> <range size>"
>
> This seems to relate more to /proc/net/vsock_cid_map, for
> /proc/net/vsock_cid_outside I think 2 parameters are enough
> (CID, range), right?
>
True, yes vsock_cid_map.
> > >
> > > 1. Identity mapping, current namespace CID is global CID (default
> > > setting for new namespaces):
> > >
> > > # empty file
> > >
> > > OR
> > >
> > > 0 0 4294967295
> > >
> > > 2. Complete isolation from global space (initialized, but no mappings):
> > >
> > > 0 0 0
> > >
> > > 3. Mapping in ranges of global CIDs
> > >
> > > For example, global CID space starts at 7000, up to 32-bit max:
> > >
> > > 7000 0 4294960295
> > >
> > > Or for multiple mappings (0-100 map to 7000-7100, 1000-1100 map to
> > > 8000-8100) :
> > >
> > > 7000 0 100
> > > 8000 1000 100
> > >
> > >
> > > One thing I don't love is that option 3 seems to not be addressing a
> > > known use case. It doesn't necessarily hurt to have, but it will add
> > > complexity to CID handling that might never get used?
>
> Yes, as I also mentioned in the previous email, we could also do a
> step-by-step thing.
>
> IMHO we can define /proc/net/vsock_cid_map (with the structure you just
> defined), but for now only support 1-1 mapping (with the ranges of
> course, I mean the first two parameters should always be the same) and
> then add option 3 in the future.
>
makes sense, sgtm!
> > >
> > > Since options 1/2 could also be represented by a boolean (yes/no
> > > "current ns shares CID with global"), I wonder if we could either A)
> > > only support the first two options at first, or B) add just
> > > /proc/net/vsock_ns_mode at first, which supports only "global" and
> > > "local", and later add a "mapped" mode plus /proc/net/vsock_cid_outside
> > > or the full mapping if the need arises?
>
> I think option A is the same as I meant above :-)
>
Indeed.
> > >
> > > This could also be how we support Option 2 from Stefano's last email of
> > > supporting per-namespace opt-in/opt-out.
>
> Hmm, how can we do it by namespace? Isn't that global?
>
I think the file path is global, but the contents are per-namespace,
determined by the namespace of the process that called open() on it.
This way the container mgr can write-once lock it, and the namespace's
processes can read it?
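Roughly, the write-once behaviour could look like the sketch below,
assuming the usual per-netns /proc/net plumbing; all of the names here
are illustrative:

    /* Hypothetical write handler for a per-netns vsock_cid_map: the
     * first successful write installs the mapping and freezes the file.
     */
    static ssize_t vsock_cid_map_write(struct vsock_pernet *pn,
                                       const char *buf, size_t len)
    {
            if (pn->map_locked)
                    return -EPERM;  /* write-once: already initialized */

            /* parse "<local start> <global start> <size>" lines from buf ... */

            pn->map_locked = true;  /* still readable, but frozen from now on */
            return len;
    }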
> > >
> > > Any thoughts on this?
> > >
> >
> > Stefano,
> >
> > Would only supporting 1/2 still support the Kata use case?
>
> I think so, actually I was thinking something similar in the message I just
> sent.
>
> By default (if the file is empty), nothing should change, so that's fine
> IMO. As Paolo suggested, we absolutely have to have tests to verify these
> things.
>
Sounds like a plan! I'm working on the new vsock vmtest now and will
include the new tests in the next rev.
Also, I'm thinking we should protect vsock_cid_map behind a capability,
but I'm not sure which one is correct (CAP_NET_ADMIN?). WDYT?
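If CAP_NET_ADMIN does turn out to be the right fit, the check would
presumably be against the user namespace owning the netns, along these
lines (sketch only):

    /* Only a writer privileged in this netns may install the map. */
    if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
            return -EPERM;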
Thanks!
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-02 22:18 ` Bobby Eshleman
2025-04-02 22:28 ` Bobby Eshleman
@ 2025-04-04 13:05 ` Daniel P. Berrangé
2025-04-18 17:57 ` Bobby Eshleman
1 sibling, 1 reply; 42+ messages in thread
From: Daniel P. Berrangé @ 2025-04-04 13:05 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Wed, Apr 02, 2025 at 03:18:13PM -0700, Bobby Eshleman wrote:
> On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
> > It occured to me that the problem we face with the CID space usage is
> > somewhat similar to the UID/GID space usage for user namespaces.
> >
> > In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
> > allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
> >
> > At the risk of being overkill, is it worth trying a similar kind of
> > approach for the vsock CID space ?
> >
> > A simple variant would be a /proc/net/vsock_cid_outside specifying a set
> > of CIDs which are exclusively referencing /dev/vhost-vsock associations
> > created outside the namespace. Anything not listed would be exclusively
> > referencing associations created inside the namespace.
> >
> > A more complex variant would be to allow a full remapping of CIDs as is
> > done with userns, via a /proc/net/vsock_cid_map, which the same three
> > parameters, so that CID=15 association outside the namespace could be
> > remapped to CID=9015 inside the namespace, allow the inside namespace
> > to define its out association for CID=15 without clashing.
> >
> > IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
> > associations created outside namespace, while unmapped CIDs would be
> > exclusively referencing /dev/vhost-vsock associations inside the
> > namespace.
> >
> > A likely benefit of relying on a kernel defined mapping/partition of
> > the CID space is that apps like QEMU don't need changing, as there's
> > no need to invent a new /dev/vhost-vsock-netns device node.
> >
> > Both approaches give the desirable security protection whereby the
> > inside namespace can be prevented from accessing certain CIDs that
> > were associated outside the namespace.
> >
> > Some rule would need to be defined for updating the /proc/net/vsock_cid_map
> > file as it is the security control mechanism. If it is write-once then
> > if the container mgmt app initializes it, nothing later could change
> > it.
> >
> > A key question is do we need the "first come, first served" behaviour
> > for CIDs where a CID can be arbitrarily used by outside or inside namespace
> > according to whatever tries to associate a CID first ?
>
> I think with /proc/net/vsock_cid_outside, instead of disallowing the CID
> from being used, this could be solved by disallowing remapping the CID
> while in use?
>
> The thing I like about this is that users can check
> /proc/net/vsock_cid_outside to figure out what might be going on,
> instead of trying to check lsof or ps to figure out if the VMM processes
> have used /dev/vhost-vsock vs /dev/vhost-vsock-netns.
>
> Just to check I am following... I suppose we would have a few typical
> configurations for /proc/net/vsock_cid_outside. Following uid_map file
> format of:
> "<local cid start> <global cid start> <range size>"
>
> 1. Identity mapping, current namespace CID is global CID (default
> setting for new namespaces):
>
> # empty file
>
> OR
>
> 0 0 4294967295
>
> 2. Complete isolation from global space (initialized, but no mappings):
>
> 0 0 0
>
> 3. Mapping in ranges of global CIDs
>
> For example, global CID space starts at 7000, up to 32-bit max:
>
> 7000 0 4294960295
>
> Or for multiple mappings (0-100 map to 7000-7100, 1000-1100 map to
> 8000-8100) :
>
> 7000 0 100
> 8000 1000 100
>
>
> One thing I don't love is that option 3 seems to not be addressing a
> known use case. It doesn't necessarily hurt to have, but it will add
> complexity to CID handling that might never get used?
Yeah, I have the same feeling that full remapping of CIDs is probably
adding complexity without clear benefit, unless it somehow helps us
with the nested-virt scenario to disambiguate L0/L1/L2 CID ranges?
I've not thought the latter through to any great level of detail,
though.
> Since options 1/2 could also be represented by a boolean (yes/no
> "current ns shares CID with global"), I wonder if we could either A)
> only support the first two options at first, or B) add just
> /proc/net/vsock_ns_mode at first, which supports only "global" and
> "local", and later add a "mapped" mode plus /proc/net/vsock_cid_outside
> or the full mapping if the need arises?
Two options are sufficient if you want to control AF_VSOCK usage
and /dev/vhost-vsock usage as a pair. If you want to control them
separately, though, that would push for three options: global,
local, and mixed. By mixed I mean AF_VSOCK in the NS can access
the global CID from the NS, but the NS can't associate the global
CID with a guest.
IOW, this breaks down like:
* CID=N local - aka fully private
Outside NS: Can associate outside CID=N with a guest.
AF_VSOCK permitted to access outside CID=N
Inside NS: Can NOT associate outside CID=N with a guest
Can associate inside CID=N with a guest
AF_VSOCK forbidden to access outside CID=N
AF_VSOCK permitted to access inside CID=N
* CID=N mixed - aka partially shared
Outside NS: Can associate outside CID=N with a guest.
AF_VSOCK permitted to access outside CID=N
Inside NS: Can NOT associate outside CID=N with a guest
AF_VSOCK permitted to access outside CID=N
No inside CID=N concept
* CID=N global - aka current historic behaviour
Outside NS: Can associate outside CID=N with a guest.
AF_VSOCK permitted to access outside CID=N
Inside NS: Can associate outside CID=N with a guest
AF_VSOCK permitted to access outside CID=N
No inside CID=N concept
I was thinking the 'mixed' mode might be useful if the outside NS wants
to retain control over setting up the association, but delegate to
processes in the inside NS for providing individual services to that
guest. This means that if the outside NS needs to restart the VM, there
is no race window in which the inside NS can grab the association with
the CID.
As for whether we need to control this per-CID, or with a single
setting applying to all CIDs:
Consider that the host OS can be running one or more "service VMs" on
well known CIDs that can be leveraged from other NS, while those other
NS also run some "end user VMs" that should be private to the NS.
IOW, the CIDs for the service VMs would need to be using "mixed"
policy, while the CIDs for the end user VMs would be "local".
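Encoding the local/mixed/global breakdown above as a per-CID policy
makes the split easy to see; this is only an illustration, nothing like
it exists in the series:

    enum vsock_cid_policy {
            CID_POLICY_LOCAL,       /* fully private to the namespace */
            CID_POLICY_MIXED,       /* partially shared */
            CID_POLICY_GLOBAL,      /* current historic behaviour */
    };

    /* May AF_VSOCK inside the NS reach the *outside* association for
     * this CID?
     */
    static bool ns_can_reach_outside(enum vsock_cid_policy p)
    {
            return p == CID_POLICY_MIXED || p == CID_POLICY_GLOBAL;
    }

    /* May a VMM inside the NS associate this *outside* CID with a guest?
     * (With "local", the NS instead gets its own, separate CID=N.)
     */
    static bool ns_can_associate_outside(enum vsock_cid_policy p)
    {
            return p == CID_POLICY_GLOBAL;
    }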
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
^ permalink raw reply	[flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-04 13:05 ` Daniel P. Berrangé
@ 2025-04-18 17:57 ` Bobby Eshleman
2025-04-22 13:35 ` Stefano Garzarella
0 siblings, 1 reply; 42+ messages in thread
From: Bobby Eshleman @ 2025-04-18 17:57 UTC (permalink / raw)
To: Daniel P. Berrangé
Cc: Stefano Garzarella, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Fri, Apr 04, 2025 at 02:05:32PM +0100, Daniel P. Berrangé wrote:
> On Wed, Apr 02, 2025 at 03:18:13PM -0700, Bobby Eshleman wrote:
> > On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
> > > It occured to me that the problem we face with the CID space usage is
> > > somewhat similar to the UID/GID space usage for user namespaces.
> > >
> > > In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
> > > allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
> > >
> > > At the risk of being overkill, is it worth trying a similar kind of
> > > approach for the vsock CID space ?
> > >
> > > A simple variant would be a /proc/net/vsock_cid_outside specifying a set
> > > of CIDs which are exclusively referencing /dev/vhost-vsock associations
> > > created outside the namespace. Anything not listed would be exclusively
> > > referencing associations created inside the namespace.
> > >
> > > A more complex variant would be to allow a full remapping of CIDs as is
> > > done with userns, via a /proc/net/vsock_cid_map, which the same three
> > > parameters, so that CID=15 association outside the namespace could be
> > > remapped to CID=9015 inside the namespace, allow the inside namespace
> > > to define its out association for CID=15 without clashing.
> > >
> > > IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
> > > associations created outside namespace, while unmapped CIDs would be
> > > exclusively referencing /dev/vhost-vsock associations inside the
> > > namespace.
> > >
> > > A likely benefit of relying on a kernel defined mapping/partition of
> > > the CID space is that apps like QEMU don't need changing, as there's
> > > no need to invent a new /dev/vhost-vsock-netns device node.
> > >
> > > Both approaches give the desirable security protection whereby the
> > > inside namespace can be prevented from accessing certain CIDs that
> > > were associated outside the namespace.
> > >
> > > Some rule would need to be defined for updating the /proc/net/vsock_cid_map
> > > file as it is the security control mechanism. If it is write-once then
> > > if the container mgmt app initializes it, nothing later could change
> > > it.
> > >
> > > A key question is do we need the "first come, first served" behaviour
> > > for CIDs where a CID can be arbitrarily used by outside or inside namespace
> > > according to whatever tries to associate a CID first ?
> >
> > I think with /proc/net/vsock_cid_outside, instead of disallowing the CID
> > from being used, this could be solved by disallowing remapping the CID
> > while in use?
> >
> > The thing I like about this is that users can check
> > /proc/net/vsock_cid_outside to figure out what might be going on,
> > instead of trying to check lsof or ps to figure out if the VMM processes
> > have used /dev/vhost-vsock vs /dev/vhost-vsock-netns.
> >
> > Just to check I am following... I suppose we would have a few typical
> > configurations for /proc/net/vsock_cid_outside. Following uid_map file
> > format of:
> > "<local cid start> <global cid start> <range size>"
> >
> > 1. Identity mapping, current namespace CID is global CID (default
> > setting for new namespaces):
> >
> > # empty file
> >
> > OR
> >
> > 0 0 4294967295
> >
> > 2. Complete isolation from global space (initialized, but no mappings):
> >
> > 0 0 0
> >
> > 3. Mapping in ranges of global CIDs
> >
> > For example, global CID space starts at 7000, up to 32-bit max:
> >
> > 7000 0 4294960295
> >
> > Or for multiple mappings (0-100 map to 7000-7100, 1000-1100 map to
> > 8000-8100) :
> >
> > 7000 0 100
> > 8000 1000 100
> >
> >
> > One thing I don't love is that option 3 seems to not be addressing a
> > known use case. It doesn't necessarily hurt to have, but it will add
> > complexity to CID handling that might never get used?
>
> Yeah, I have the same feeling that full remapping of CIDs is probably
> adding complexity without clear benefit, unless it somehow helps us
> with the nested-virt scenario to disambiguate L0/L1/L2 CID ranges ?
> I've not thought the latter through to any great level of detail
> though
>
> > Since options 1/2 could also be represented by a boolean (yes/no
> > "current ns shares CID with global"), I wonder if we could either A)
> > only support the first two options at first, or B) add just
> > /proc/net/vsock_ns_mode at first, which supports only "global" and
> > "local", and later add a "mapped" mode plus /proc/net/vsock_cid_outside
> > or the full mapping if the need arises?
>
> Two options is sufficient if you want to control AF_VSOCK usage
> and /dev/vhost-vsock usage as a pair. If you want to separately
> control them though, it would push for three options - global,
> local, and mixed. By mixed I mean AF_VSOCK in the NS can access
> the global CID from the NS, but the NS can't associate the global
> CID with a guest.
>
> IOW, this breaks down like:
>
> * CID=N local - aka fully private
>
> Outside NS: Can associate outside CID=N with a guest.
> AF_VSOCK permitted to access outside CID=N
>
> Inside NS: Can NOT associate outside CID=N with a guest
> Can associate inside CID=N with a guest
> AF_VSOCK forbidden to access outside CID=N
> AF_VSOCK permitted to access inside CID=N
>
>
> * CID=N mixed - aka partially shared
>
> Outside NS: Can associate outside CID=N with a guest.
> AF_VSOCK permitted to access outside CID=N
>
> Inside NS: Can NOT associate outside CID=N with a guest
> AF_VSOCK permitted to access outside CID=N
> No inside CID=N concept
>
>
> * CID=N global - aka current historic behaviour
>
> Outside NS: Can associate outside CID=N with a guest.
> AF_VSOCK permitted to access outside CID=N
>
> Inside NS: Can associate outside CID=N with a guest
> AF_VSOCK permitted to access outside CID=N
> No inside CID=N concept
>
>
> I was thinking the 'mixed' mode might be useful if the outside NS wants
> to retain control over setting up the association, but delegate to
> processes in the inside NS for providing individual services to that
> guest. This means if the outside NS needs to restart the VM, there is
> no race window in which the inside NS can grab the association with the
> CID
>
> As for whether we need to control this per-CID, or a single setting
> applying to all CID.
>
> Consider that the host OS can be running one or more "service VMs" on
> well known CIDs that can be leveraged from other NS, while those other
> NS also run some "end user VMs" that should be private to the NS.
>
> IOW, the CIDs for the service VMs would need to be using "mixed"
> policy, while the CIDs for the end user VMs would be "local".
>
I think this sounds pretty flexible, and IMO adding the third mode
doesn't add much additional complexity.
Going this route, we have:
- three modes: local, global, mixed
- at first, no vsock_cid_map (local has no outside CIDs, global and mixed have no inside
CIDs, so no cross-mapping needed)
- only later add a full mapped mode and vsock_cid_map if necessary.
Stefano, any preferences on this vs starting with the restricted
vsock_cid_map (only supporting "0 0 0" and "0 0 <size>")?
I'm leaning towards the modes because they cover more use cases and seem
like a clearer user interface?
To clarify another aspect... child namespaces must inherit the parent's
local restriction. So if namespace P sets the mode to local, and then creates
a child process that then creates namespace C... then C's global and mixed
modes are implicitly restricted to P's local space?
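
A tiny illustration of that inheritance rule (assuming the modes are ordered
from least to most restrictive; the names are made up for this sketch and
not part of the series):

    #include <stdio.h>

    enum vsock_ns_mode { VSOCK_NS_GLOBAL, VSOCK_NS_MIXED, VSOCK_NS_LOCAL };

    /* A child namespace can never be less restrictive than its parent. */
    static enum vsock_ns_mode effective_mode(enum vsock_ns_mode parent,
                                             enum vsock_ns_mode child)
    {
            return child > parent ? child : parent;
    }

    int main(void)
    {
            /* P is "local"; C asks for "global" but is effectively "local" (2). */
            printf("%d\n", effective_mode(VSOCK_NS_LOCAL, VSOCK_NS_GLOBAL));
            return 0;
    }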
Thanks,
Bobby
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-18 17:57 ` Bobby Eshleman
@ 2025-04-22 13:35 ` Stefano Garzarella
0 siblings, 0 replies; 42+ messages in thread
From: Stefano Garzarella @ 2025-04-22 13:35 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Daniel P. Berrangé, Jakub Kicinski, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Stefan Hajnoczi,
Michael S. Tsirkin, Jason Wang, Xuan Zhuo, Eugenio Pérez,
Bryan Tan, Vishnu Dasa, Broadcom internal kernel review list,
David S. Miller, virtualization, netdev, linux-kernel,
linux-hyperv, kvm
On Fri, Apr 18, 2025 at 10:57:52AM -0700, Bobby Eshleman wrote:
>On Fri, Apr 04, 2025 at 02:05:32PM +0100, Daniel P. Berrangé wrote:
>> On Wed, Apr 02, 2025 at 03:18:13PM -0700, Bobby Eshleman wrote:
>> > On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
>> > > It occured to me that the problem we face with the CID space usage is
>> > > somewhat similar to the UID/GID space usage for user namespaces.
>> > >
>> > > In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
>> > > allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
>> > >
>> > > At the risk of being overkill, is it worth trying a similar kind of
>> > > approach for the vsock CID space ?
>> > >
>> > > A simple variant would be a /proc/net/vsock_cid_outside specifying a set
>> > > of CIDs which are exclusively referencing /dev/vhost-vsock associations
>> > > created outside the namespace. Anything not listed would be exclusively
>> > > referencing associations created inside the namespace.
>> > >
>> > > A more complex variant would be to allow a full remapping of CIDs as is
>> > > done with userns, via a /proc/net/vsock_cid_map, which the same three
>> > > parameters, so that CID=15 association outside the namespace could be
>> > > remapped to CID=9015 inside the namespace, allow the inside namespace
>> > > to define its out association for CID=15 without clashing.
>> > >
>> > > IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
>> > > associations created outside namespace, while unmapped CIDs would be
>> > > exclusively referencing /dev/vhost-vsock associations inside the
>> > > namespace.
>> > >
>> > > A likely benefit of relying on a kernel defined mapping/partition of
>> > > the CID space is that apps like QEMU don't need changing, as there's
>> > > no need to invent a new /dev/vhost-vsock-netns device node.
>> > >
>> > > Both approaches give the desirable security protection whereby the
>> > > inside namespace can be prevented from accessing certain CIDs that
>> > > were associated outside the namespace.
>> > >
>> > > Some rule would need to be defined for updating the /proc/net/vsock_cid_map
>> > > file as it is the security control mechanism. If it is write-once then
>> > > if the container mgmt app initializes it, nothing later could change
>> > > it.
>> > >
>> > > A key question is do we need the "first come, first served" behaviour
>> > > for CIDs where a CID can be arbitrarily used by outside or inside namespace
>> > > according to whatever tries to associate a CID first ?
>> >
>> > I think with /proc/net/vsock_cid_outside, instead of disallowing the CID
>> > from being used, this could be solved by disallowing remapping the CID
>> > while in use?
>> >
>> > The thing I like about this is that users can check
>> > /proc/net/vsock_cid_outside to figure out what might be going on,
>> > instead of trying to check lsof or ps to figure out if the VMM processes
>> > have used /dev/vhost-vsock vs /dev/vhost-vsock-netns.
>> >
>> > Just to check I am following... I suppose we would have a few typical
>> > configurations for /proc/net/vsock_cid_outside. Following uid_map file
>> > format of:
>> > "<local cid start> <global cid start> <range size>"
>> >
>> > 1. Identity mapping, current namespace CID is global CID (default
>> > setting for new namespaces):
>> >
>> > # empty file
>> >
>> > OR
>> >
>> > 0 0 4294967295
>> >
>> > 2. Complete isolation from global space (initialized, but no mappings):
>> >
>> > 0 0 0
>> >
>> > 3. Mapping in ranges of global CIDs
>> >
>> > For example, global CID space starts at 7000, up to 32-bit max:
>> >
>> > 7000 0 4294960295
>> >
>> > Or for multiple mappings (0-100 map to 7000-7100, 1000-1100 map to
>> > 8000-8100) :
>> >
>> > 7000 0 100
>> > 8000 1000 100
>> >
>> >
>> > One thing I don't love is that option 3 seems to not be addressing a
>> > known use case. It doesn't necessarily hurt to have, but it will add
>> > complexity to CID handling that might never get used?
>>
>> Yeah, I have the same feeling that full remapping of CIDs is probably
>> adding complexity without clear benefit, unless it somehow helps us
>> with the nested-virt scenario to disambiguate L0/L1/L2 CID ranges ?
>> I've not thought the latter through to any great level of detail
>> though
>>
>> > Since options 1/2 could also be represented by a boolean (yes/no
>> > "current ns shares CID with global"), I wonder if we could either A)
>> > only support the first two options at first, or B) add just
>> > /proc/net/vsock_ns_mode at first, which supports only "global" and
>> > "local", and later add a "mapped" mode plus /proc/net/vsock_cid_outside
>> > or the full mapping if the need arises?
>>
>> Two options is sufficient if you want to control AF_VSOCK usage
>> and /dev/vhost-vsock usage as a pair. If you want to separately
>> control them though, it would push for three options - global,
>> local, and mixed. By mixed I mean AF_VSOCK in the NS can access
>> the global CID from the NS, but the NS can't associate the global
>> CID with a guest.
>>
>> IOW, this breaks down like:
>>
>> * CID=N local - aka fully private
>>
>> Outside NS: Can associate outside CID=N with a guest.
>> AF_VSOCK permitted to access outside CID=N
>>
>> Inside NS: Can NOT associate outside CID=N with a guest
>> Can associate inside CID=N with a guest
>> AF_VSOCK forbidden to access outside CID=N
>> AF_VSOCK permitted to access inside CID=N
>>
>>
>> * CID=N mixed - aka partially shared
>>
>> Outside NS: Can associate outside CID=N with a guest.
>> AF_VSOCK permitted to access outside CID=N
>>
>> Inside NS: Can NOT associate outside CID=N with a guest
>> AF_VSOCK permitted to access outside CID=N
>> No inside CID=N concept
>>
>>
>> * CID=N global - aka current historic behaviour
>>
>> Outside NS: Can associate outside CID=N with a guest.
>> AF_VSOCK permitted to access outside CID=N
>>
>> Inside NS: Can associate outside CID=N with a guest
>> AF_VSOCK permitted to access outside CID=N
>> No inside CID=N concept
>>
>>
>> I was thinking the 'mixed' mode might be useful if the outside NS wants
>> to retain control over setting up the association, but delegate to
>> processes in the inside NS for providing individual services to that
>> guest. This means if the outside NS needs to restart the VM, there is
>> no race window in which the inside NS can grab the association with the
>> CID
>>
>> As for whether we need to control this per-CID, or a single setting
>> applying to all CID.
>>
>> Consider that the host OS can be running one or more "service VMs" on
>> well known CIDs that can be leveraged from other NS, while those other
>> NS also run some "end user VMs" that should be private to the NS.
>>
>> IOW, the CIDs for the service VMs would need to be using "mixed"
>> policy, while the CIDs for the end user VMs would be "local".
>>
>
>I think this sounds pretty flexible, and IMO adding the third mode
>doesn't add much more additional complexity.
>
>Going this route, we have:
>- three modes: local, global, mixed
>- at first, no vsock_cid_map (local has no outside CIDs, global and mixed have no inside
> CIDs, so no cross-mapping needed)
>- only later add a full mapped mode and vsock_cid_map if necessary.
>
>Stefano, any preferences on this vs starting with the restricted
>vsock_cid_map (only supporting "0 0 0" and "0 0 <size>")?
No preference, I also like this idea.
>
>I'm leaning towards the modes because it covers more use cases and seems
>like a clearer user interface?
Sure, go ahead!
>
>To clarify another aspect... child namespaces must inherit the parent's
>local. So if namespace P sets the mode to local, and then creates a
>child process that then creates namespace C... then C's global and mixed
>modes are implicitly restricted to P's local space?
I think so, but it's still not clear to me whether the mode can be selected
per namespace or is a setting for the entire system; I think we can discuss
this better over a proposal with some code :-)
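
For the per-namespace option, a minimal sketch of where the mode could live
(all names below are assumptions of the sketch, not from the posted series)
would be pernet storage:

    #include <net/net_namespace.h>
    #include <net/netns/generic.h>

    enum vsock_ns_mode { VSOCK_NS_GLOBAL, VSOCK_NS_MIXED, VSOCK_NS_LOCAL };

    struct vsock_net {
            enum vsock_ns_mode mode;        /* exposed via /proc/net/vsock_ns_mode */
    };

    static unsigned int vsock_net_id;

    static int __net_init vsock_net_init(struct net *net)
    {
            struct vsock_net *vn = net_generic(net, vsock_net_id);

            vn->mode = VSOCK_NS_GLOBAL;     /* default: historic behaviour */
            return 0;
    }

    static struct pernet_operations vsock_net_ops = {
            .init = vsock_net_init,
            .id   = &vsock_net_id,
            .size = sizeof(struct vsock_net),
    };

    /* registered once at module init with register_pernet_subsys(&vsock_net_ops) */

A single system-wide setting, by contrast, would just be one sysctl and no
per-netns state.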
Thanks,
Stefano
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v2 0/3] vsock: add namespace support to vhost-vsock
2025-04-02 9:21 ` Daniel P. Berrangé
2025-04-02 22:18 ` Bobby Eshleman
@ 2025-04-03 9:01 ` Stefano Garzarella
1 sibling, 0 replies; 42+ messages in thread
From: Stefano Garzarella @ 2025-04-03 9:01 UTC (permalink / raw)
To: Daniel P. Berrangé
Cc: Bobby Eshleman, Jakub Kicinski, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Stefan Hajnoczi, Michael S. Tsirkin,
Jason Wang, Xuan Zhuo, Eugenio Pérez, Bryan Tan, Vishnu Dasa,
Broadcom internal kernel review list, David S. Miller,
virtualization, netdev, linux-kernel, linux-hyperv, kvm
On Wed, Apr 02, 2025 at 10:21:36AM +0100, Daniel P. Berrangé wrote:
>On Wed, Apr 02, 2025 at 10:13:43AM +0200, Stefano Garzarella wrote:
>> On Wed, 2 Apr 2025 at 02:21, Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
>> >
>> > I do like Stefano's suggestion to add a sysctl for a "strict" mode,
>> > Since it offers the best of both worlds, and still tends conservative in
>> > protecting existing applications... but I agree, the non-strict mode
>> > vsock would be unique WRT the usual concept of namespaces.
>>
>> Maybe we could do the opposite, enable strict mode by default (I think
>> it was similar to what I had tried to do with the kernel module in v1, I
>> was young I know xD)
>> And provide a way to disable it for those use cases where the user wants
>> backward compatibility, while paying the cost of less isolation.
>
>I think backwards compatible has to be the default behaviour, otherwise
>the change has too high risk of breaking existing deployments that are
>already using netns and relying on VSOCK being global. Breakage has to
>be opt in.
>
>> I was thinking two options (not sure if the second one can be done):
>>
>> 1. provide a global sysfs/sysctl that disables strict mode, but this
>> then applies to all namespaces
>>
>> 2. provide something that allows disabling strict mode by namespace.
>> Maybe when it is created there are options, or something that can be
>> set later.
>>
>> 2 would be ideal, but that might be too much, so 1 might be enough. In
>> any case, 2 could also be a next step.
>>
>> WDYT?
>
>It occured to me that the problem we face with the CID space usage is
>somewhat similar to the UID/GID space usage for user namespaces.
>
>In the latter case, userns has exposed /proc/$PID/uid_map & gid_map, to
>allow IDs in the namespace to be arbitrarily mapped onto IDs in the host.
>
>At the risk of being overkill, is it worth trying a similar kind of
>approach for the vsock CID space ?
>
>A simple variant would be a /proc/net/vsock_cid_outside specifying a set
>of CIDs which are exclusively referencing /dev/vhost-vsock associations
>created outside the namespace. Anything not listed would be exclusively
>referencing associations created inside the namespace.
I like the idea and I think it is also easily usable in a nested
environment, where for example in L1 we can decide whether or not a
namespace can access the L0 host (CID=2), by adding 2 to
/proc/net/vsock_cid_outside.
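
Just to illustrate that usage (the file is still hypothetical at this stage),
granting such a namespace access to the L0 host could be as simple as writing
the CID into it:

    /* Hypothetical: write "2" into the proposed /proc/net/vsock_cid_outside
     * from inside the L1 namespace, so CID=2 (the L0 host) refers to the
     * association created outside the namespace. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/net/vsock_cid_outside", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fputs("2\n", f);
            fclose(f);
            return 0;
    }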
>
>A more complex variant would be to allow a full remapping of CIDs as is
>done with userns, via a /proc/net/vsock_cid_map, which the same three
>parameters, so that CID=15 association outside the namespace could be
>remapped to CID=9015 inside the namespace, allow the inside namespace
>to define its out association for CID=15 without clashing.
>
>IOW, mapped CIDs would be exclusively referencing /dev/vhost-vsock
>associations created outside namespace, while unmapped CIDs would be
>exclusively referencing /dev/vhost-vsock associations inside the
>namespace.
This is maybe a little overkill, but I don't object to it!
It could also be a next step. But if it's easy to implement, we can go
straight with it.
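
For illustration, with the "<local cid start> <global cid start> <range size>"
line format sketched earlier in the thread, the translation a vsock_cid_map
would have to do looks roughly like this (plain C, names invented for the
sketch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct cid_map_entry {
            uint32_t inside_start;
            uint32_t outside_start;
            uint32_t count;
    };

    /* Translate an inside CID to the outside CID it maps to; returns false
     * if the CID is unmapped, i.e. purely local to the namespace. */
    static bool cid_map_translate(const struct cid_map_entry *map, size_t n,
                                  uint32_t inside, uint32_t *outside)
    {
            for (size_t i = 0; i < n; i++) {
                    if (inside >= map[i].inside_start &&
                        inside - map[i].inside_start < map[i].count) {
                            *outside = map[i].outside_start +
                                       (inside - map[i].inside_start);
                            return true;
                    }
            }
            return false;
    }

    int main(void)
    {
            /* "9000 0 100": inside 9000-9099 reference outside 0-99, so the
             * inside CID 9015 is the outside CID 15 from the example above. */
            struct cid_map_entry map[] = { { 9000, 0, 100 } };
            uint32_t out;

            if (cid_map_translate(map, 1, 9015, &out))
                    printf("inside 9015 -> outside %u\n", out);
            return 0;
    }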
>
>A likely benefit of relying on a kernel defined mapping/partition of
>the CID space is that apps like QEMU don't need changing, as there's
>no need to invent a new /dev/vhost-vsock-netns device node.
Yeah, I see that!
However, should this be paired with a sysctl/sysfs knob for opt-in?
Or can we detect that the user never wrote these files and, in that case,
behave as before (though maybe we would need to reverse the logic; I'm not
sure that makes sense)?
>
>Both approaches give the desirable security protection whereby the
>inside namespace can be prevented from accessing certain CIDs that
>were associated outside the namespace.
>
>Some rule would need to be defined for updating the /proc/net/vsock_cid_map
>file as it is the security control mechanism. If it is write-once then
>if the container mgmt app initializes it, nothing later could change
>it.
>
>A key question is do we need the "first come, first served" behaviour
>for CIDs where a CID can be arbitrarily used by outside or inside namespace
>according to whatever tries to associate a CID first ?
>
>IMHO those semantics lead to unpredictable behaviour for apps because
>what happens depends on ordering of app launches inside & outside the
>namespace, but they do sort of allow for VSOCK namespace behaviour to
>be 'zero conf' out of the box.
Yes, I agree that we should avoid it if possible.
>
>A mapping that strictly partitions CIDs to either outside or inside
>namespace usage, but never both, gives well defined behaviour, at the
>cost of needing to setup an initial mapping/partition.
Thanks for your points!
Stefano
^ permalink raw reply [flat|nested] 42+ messages in thread