public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Stefano Garzarella <sgarzare@redhat.com>
To: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Cc: Stefan Hajnoczi <stefanha@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	Jason Wang <jasowang@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	"edumazet@google.com" <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Krasnov Arseniy <oxffffaa@gmail.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"virtualization@lists.linux-foundation.org" 
	<virtualization@lists.linux-foundation.org>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	kernel <kernel@sberdevices.ru>
Subject: Re: [RFC PATCH v3 03/11] af_vsock: add zerocopy receive logic
Date: Fri, 11 Nov 2022 14:55:49 +0100	[thread overview]
Message-ID: <20221111135549.2fqufprbc3muedmr@sgarzare-redhat> (raw)
In-Reply-To: <7aeba781-db09-9be1-a9a3-a4c16da38fb5@sberdevices.ru>

On Sun, Nov 06, 2022 at 07:40:12PM +0000, Arseniy Krasnov wrote:
>This:
>1) Adds callback for 'mmap()' call on socket. It checks vm area flags
>   and sets vm area ops.
>2) Adds special 'getsockopt()' case which calls transport zerocopy
>   callback. Input argument is vm area address.
>3) Adds 'getsockopt()/setsockopt()' for switching on/off rx zerocopy
>   mode.
>
>Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>---
> include/net/af_vsock.h          |   8 ++
> include/uapi/linux/vm_sockets.h |   3 +
> net/vmw_vsock/af_vsock.c        | 187 +++++++++++++++++++++++++++++++-
> 3 files changed, 196 insertions(+), 2 deletions(-)
>
>diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>index 568a87c5e0d0..e4f12ef8e623 100644
>--- a/include/net/af_vsock.h
>+++ b/include/net/af_vsock.h
>@@ -73,6 +73,8 @@ struct vsock_sock {
>
> 	/* Private to transport. */
> 	void *trans;
>+
>+	bool rx_zerocopy_on;

Maybe better to leave the last fields the private ones to transports, so 
I would say put it before trans;

> };
>
> s64 vsock_stream_has_data(struct vsock_sock *vsk);
>@@ -138,6 +140,12 @@ struct vsock_transport {
> 	bool (*stream_allow)(u32 cid, u32 port);
> 	int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
>
>+	int (*zerocopy_rx_mmap)(struct vsock_sock *vsk,
>+				struct vm_area_struct *vma);
>+	int (*zerocopy_dequeue)(struct vsock_sock *vsk,
>+				struct page **pages,
>+				unsigned long *pages_num);
>+
> 	/* SEQ_PACKET. */
> 	ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
> 				     int flags);
>diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
>index c60ca33eac59..d1f792bed1a7 100644
>--- a/include/uapi/linux/vm_sockets.h
>+++ b/include/uapi/linux/vm_sockets.h
>@@ -83,6 +83,9 @@
>
> #define SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW 8
>
>+#define SO_VM_SOCKETS_MAP_RX 9
>+#define SO_VM_SOCKETS_ZEROCOPY 10

Before removing RFC, we should document these macros because they are 
exposed to the user.

>+
> #if !defined(__KERNEL__)
> #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && 
> defined(__ILP32__))
> #define SO_VM_SOCKETS_CONNECT_TIMEOUT SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD
>diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>index ee418701cdee..21a915eb0820 100644
>--- a/net/vmw_vsock/af_vsock.c
>+++ b/net/vmw_vsock/af_vsock.c
>@@ -1663,6 +1663,16 @@ static int vsock_connectible_setsockopt(struct socket *sock,
> 		}
> 		break;
> 	}
>+	case SO_VM_SOCKETS_ZEROCOPY: {
>+		if (sock->state != SS_UNCONNECTED) {
>+			err = -EOPNOTSUPP;
>+			break;
>+		}
>+
>+		COPY_IN(val);
>+		vsk->rx_zerocopy_on = val;
>+		break;
>+	}
>
> 	default:
> 		err = -ENOPROTOOPT;
>@@ -1676,6 +1686,124 @@ static int vsock_connectible_setsockopt(struct socket *sock,
> 	return err;
> }
>
>+static const struct vm_operations_struct afvsock_vm_ops = {
>+};
>+
>+static int vsock_recv_zerocopy(struct socket *sock,
>+			       unsigned long address)
>+{
>+	const struct vsock_transport *transport;
>+	struct vm_area_struct *vma;
>+	unsigned long vma_pages;
>+	struct vsock_sock *vsk;
>+	struct page **pages;
>+	struct sock *sk;
>+	int err;
>+	int i;
>+
>+	sk = sock->sk;
>+	vsk = vsock_sk(sk);
>+	err = 0;
>+
>+	lock_sock(sk);
>+
>+	if (!vsk->rx_zerocopy_on) {
>+		err = -EOPNOTSUPP;
>+		goto out_unlock_sock;
>+	}
>+
>+	transport = vsk->transport;
>+
>+	if (!transport->zerocopy_dequeue) {
>+		err = -EOPNOTSUPP;
>+		goto out_unlock_sock;
>+	}
>+
>+	mmap_write_lock(current->mm);
>+
>+	vma = vma_lookup(current->mm, address);
>+
>+	if (!vma || vma->vm_ops != &afvsock_vm_ops) {
>+		err = -EINVAL;
>+		goto out_unlock_vma;
>+	}
>+
>+	/* Allow to use vm area only from the first page. */
>+	if (vma->vm_start != address) {
>+		err = -EINVAL;
>+		goto out_unlock_vma;
>+	}
>+
>+	vma_pages = (vma->vm_end - vma->vm_start) / PAGE_SIZE;
>+	pages = kmalloc_array(vma_pages, sizeof(pages[0]),
>+			      GFP_KERNEL | __GFP_ZERO);
>+
>+	if (!pages) {
>+		err = -EINVAL;
>+		goto out_unlock_vma;
>+	}
>+
>+	err = transport->zerocopy_dequeue(vsk, pages, &vma_pages);
>+
>+	if (err)
>+		goto out_unlock_vma;
>+
>+	/* Now 'vma_pages' contains number of pages in array.
>+	 * If array element is NULL, skip it, go to next page.
>+	 */
>+	for (i = 0; i < vma_pages; i++) {
>+		if (pages[i]) {
>+			unsigned long pages_inserted;
>+
>+			pages_inserted = 1;
>+			err = vm_insert_pages(vma, address, &pages[i], &pages_inserted);
>+
>+			if (err || pages_inserted) {
>+				/* Failed to insert some pages, we have "partially"
>+				 * mapped vma. Do not return, set error code. This
>+				 * code will be returned to user. User needs to call
>+				 * 'madvise()/mmap()' to clear this vma. Anyway,
>+				 * references to all pages will to be dropped below.
>+				 */
>+				if (!err) {
>+					err = -EFAULT;
>+					break;
>+				}
>+			}
>+		}
>+
>+		address += PAGE_SIZE;
>+	}
>+
>+	i = 0;
>+
>+	while (i < vma_pages) {
>+		/* Drop ref count for all pages, returned by transport.
>+		 * We call 'put_page()' only once, as transport needed
>+		 * to 'get_page()' at least only once also, to prevent
>+		 * pages being freed. If transport calls 'get_page()'
>+		 * more twice or more for every page - we don't care,
>+		 * if transport calls 'get_page()' only one time, this
>+		 * meanse that every page had ref count equal to 1,then
>+		 * 'vm_insert_pages()' increments it to 2. After this
>+		 * loop, ref count will be 1 again, and page will be
>+		 * returned to allocator by user.
>+		 */
>+		if (pages[i])
>+			put_page(pages[i]);
>+		i++;
>+	}
>+
>+	kfree(pages);
>+
>+out_unlock_vma:
>+	mmap_write_unlock(current->mm);
>+out_unlock_sock:
>+	release_sock(sk);
>+
>+	return err;
>+}
>+
> static int vsock_connectible_getsockopt(struct socket *sock,
> 					int level, int optname,
> 					char __user *optval,
>@@ -1720,6 +1848,26 @@ static int vsock_connectible_getsockopt(struct socket *sock,
> 		lv = sock_get_timeout(vsk->connect_timeout, &v,
> 				      optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
> 		break;
>+	case SO_VM_SOCKETS_ZEROCOPY: {
>+		lock_sock(sk);
>+
>+		v.val64 = vsk->rx_zerocopy_on;
>+
>+		release_sock(sk);
>+
>+		break;
>+	}
>+	case SO_VM_SOCKETS_MAP_RX: {
>+		unsigned long vma_addr;
>+
>+		if (len < sizeof(vma_addr))
>+			return -EINVAL;
>+
>+		if (copy_from_user(&vma_addr, optval, sizeof(vma_addr)))
>+			return -EFAULT;
>+
>+		return vsock_recv_zerocopy(sock, vma_addr);
>+	}
>
> 	default:
> 		return -ENOPROTOOPT;
>@@ -2167,6 +2315,41 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
> 	return 0;
> }
>
>+static int afvsock_mmap(struct file *file, struct socket *sock,
>+			struct vm_area_struct *vma)
>+{
>+	const struct vsock_transport *transport;
>+	struct vsock_sock *vsk;
>+	struct sock *sk;
>+	int err;
>+
>+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
>+		return -EPERM;
>+
>+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
>+	vma->vm_flags |= (VM_MIXEDMAP);
>+	vma->vm_ops = &afvsock_vm_ops;
>+
>+	sk = sock->sk;
>+	vsk = vsock_sk(sk);
>+
>+	lock_sock(sk);
>+
>+	transport = vsk->transport;
>+
>+	if (!transport || !transport->zerocopy_rx_mmap) {
>+		err = -EOPNOTSUPP;
>+		goto out_unlock;
>+	}
>+
>+	err = transport->zerocopy_rx_mmap(vsk, vma);
>+
>+out_unlock:
>+	release_sock(sk);
>+
>+	return err;
>+}
>+
> static const struct proto_ops vsock_stream_ops = {
> 	.family = PF_VSOCK,
> 	.owner = THIS_MODULE,
>@@ -2184,7 +2367,7 @@ static const struct proto_ops vsock_stream_ops = {
> 	.getsockopt = vsock_connectible_getsockopt,
> 	.sendmsg = vsock_connectible_sendmsg,
> 	.recvmsg = vsock_connectible_recvmsg,
>-	.mmap = sock_no_mmap,
>+	.mmap = afvsock_mmap,
> 	.sendpage = sock_no_sendpage,
> 	.set_rcvlowat = vsock_set_rcvlowat,
> };
>@@ -2206,7 +2389,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
> 	.getsockopt = vsock_connectible_getsockopt,
> 	.sendmsg = vsock_connectible_sendmsg,
> 	.recvmsg = vsock_connectible_recvmsg,
>-	.mmap = sock_no_mmap,
>+	.mmap = afvsock_mmap,
> 	.sendpage = sock_no_sendpage,
> };
>
>-- 
>2.35.0


  reply	other threads:[~2022-11-11 14:01 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-06 19:33 [RFC PATCH v3 00/11] virtio/vsock: experimental zerocopy receive Arseniy Krasnov
2022-11-06 19:36 ` [RFC PATCH v3 01/11] virtio/vsock: rework packet allocation logic Arseniy Krasnov
2022-11-06 19:50   ` Christophe JAILLET
2022-11-07  5:23     ` Arseniy Krasnov
2022-11-07 21:24       ` Christophe JAILLET
2022-11-07 21:31         ` Arseniy Krasnov
2022-11-11 20:35   ` Bobby Eshleman
2022-11-06 19:38 ` [RFC PATCH v3 02/11] virtio/vsock: update, 'virtio_transport_recv_pkt()' Arseniy Krasnov
2022-11-06 19:40 ` [RFC PATCH v3 03/11] af_vsock: add zerocopy receive logic Arseniy Krasnov
2022-11-11 13:55   ` Stefano Garzarella [this message]
2022-11-06 19:41 ` [RFC PATCH v3 04/11] virtio/vsock: add transport zerocopy callback Arseniy Krasnov
2022-11-10 11:15   ` Arseniy Krasnov
2022-11-06 19:43 ` [RFC PATCH v3 05/11] vhost/vsock: switch packet's buffer allocation Arseniy Krasnov
2022-11-06 19:45 ` [RFC PATCH v3 06/11] vhost/vsock: enable zerocopy callback Arseniy Krasnov
2022-11-06 19:47 ` [RFC PATCH v3 07/11] virtio/vsock: " Arseniy Krasnov
2022-11-06 19:48 ` [RFC PATCH v3 08/11] test/vsock: rework message bound test Arseniy Krasnov
2022-11-11 14:00   ` Stefano Garzarella
2022-11-06 19:50 ` [RFC PATCH v3 09/11] test/vsock: add big message test Arseniy Krasnov
2022-11-06 19:51 ` [RFC PATCH v3 10/11] test/vsock: add receive zerocopy tests Arseniy Krasnov
2022-11-06 19:54 ` [RFC PATCH v3 11/11] test/vsock: vsock_rx_perf utility Arseniy Krasnov
2022-11-11 13:47 ` [RFC PATCH v3 00/11] virtio/vsock: experimental zerocopy receive Stefano Garzarella
2022-11-11 14:06   ` Stefano Garzarella
2022-11-11 18:35     ` Arseniy Krasnov
2022-11-11 20:45   ` Bobby Eshleman
2022-11-12 11:40     ` Arseniy Krasnov
2022-11-13 10:04       ` Arseniy Krasnov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221111135549.2fqufprbc3muedmr@sgarzare-redhat \
    --to=sgarzare@redhat.com \
    --cc=AVKrasnov@sberdevices.ru \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=jasowang@redhat.com \
    --cc=kernel@sberdevices.ru \
    --cc=kuba@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=oxffffaa@gmail.com \
    --cc=pabeni@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox