public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Cindy Lu <lulu@redhat.com>
To: lulu@redhat.com, mst@redhat.com, jasowang@redhat.com,
	kvm@vger.kernel.org, virtualization@lists.linux.dev,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [RFC 3/3] vhost/net: add RX netfilter offload path
Date: Sun,  8 Feb 2026 22:32:24 +0800	[thread overview]
Message-ID: <20260208143441.2177372-4-lulu@redhat.com> (raw)
In-Reply-To: <20260208143441.2177372-1-lulu@redhat.com>

Route RX packets through the netfilter socket when configured.
Key points:
- Add VHOST_NET_FILTER_MAX_LEN upper bound for filter payload size
- Introduce vhost_net_filter_request() to send REQUEST to userspace
- Add handle_rx_filter() dedicated RX path used when filter is active
- Hook filter path in handle_rx() when filter_sock is set

Signed-off-by: Cindy Lu <lulu@redhat.com>
---
 drivers/vhost/net.c | 229 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 229 insertions(+)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f02deff0e53c..aa9a5ed43eae 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -161,6 +161,13 @@ struct vhost_net {
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
 
+/*
+ * Upper bound for a single packet payload on the filter path.
+ * Keep this large enough for the largest expected frame plus vnet headers,
+ * but still bounded to avoid unbounded allocations.
+ */
+#define VHOST_NET_FILTER_MAX_LEN (4096 + 65536)
+
 static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
 {
 	if (rxq->tail != rxq->head)
@@ -1227,6 +1234,222 @@ static long vhost_net_set_filter(struct vhost_net *n, int fd)
 	return r;
 }
 
+/*
+ * Send a filter REQUEST message to userspace for a single packet.
+ *
+ * This only transmits the REQUEST; no reply is read here and *len is never
+ * updated, so any accept/drop decision based on the packet length must be
+ * made by the caller. Returns 0 on success or a negative errno on failure.
+ */
+static int vhost_net_filter_request(struct vhost_net *n, u16 direction,
+				    void *buf, u32 *len)
+{
+	struct vhost_net_filter_msg msg = {
+		.type = VHOST_NET_FILTER_MSG_REQUEST,
+		.direction = direction,
+		.len = *len,
+	};
+	struct msghdr msghdr = { 0 };
+	struct kvec iov[2] = {
+		{ .iov_base = &msg, .iov_len = sizeof(msg) },
+		{ .iov_base = buf, .iov_len = *len },
+	};
+	struct socket *sock;
+	struct file *sock_file = NULL;
+	int ret;
+
+	/*
+	 * Take a temporary file reference to guard against concurrent
+	 * filter socket replacement while we send the message.
+	 */
+	spin_lock(&n->filter_lock);
+	sock = n->filter_sock;
+	if (sock)
+		sock_file = get_file(sock->file);
+	spin_unlock(&n->filter_lock);
+
+	if (!sock) {
+		ret = -ENOTCONN;
+		goto out_put;
+	}
+
+	ret = kernel_sendmsg(sock, &msghdr, iov,
+			     *len ? 2 : 1, sizeof(msg) + *len);
+
+out_put:
+	if (sock_file)
+		fput(sock_file);
+
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
+/*
+ * RX path used when filter offload is active.
+ *
+ * This mirrors handle_rx() but bounces each RX packet through the userspace
+ * filter socket: the packet is copied into a temporary buffer and sent as a
+ * REQUEST before being delivered to the guest. NOTE(review): a failure to
+ * send the REQUEST is fail-open — the packet is delivered unfiltered.
+ */
+static void handle_rx_filter(struct vhost_net *net, struct socket *sock)
+{
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_virtqueue *vq = &nvq->vq;
+	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+	unsigned int count = 0;
+	unsigned int in, log;
+	struct vhost_log *vq_log;
+	struct virtio_net_hdr hdr = {
+		.flags = 0,
+		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_flags = MSG_DONTWAIT,
+	};
+	size_t total_len = 0;
+	int mergeable;
+	bool set_num_buffers;
+	size_t vhost_hlen, sock_hlen;
+	size_t vhost_len, sock_len;
+	bool busyloop_intr = false;
+	struct iov_iter fixup;
+	__virtio16 num_buffers;
+	int recv_pkts = 0;
+	unsigned int ndesc;
+	void *pkt;
+
+	pkt = kvmalloc(VHOST_NET_FILTER_MAX_LEN, GFP_KERNEL | __GFP_NOWARN);
+	if (!pkt) {
+		vhost_net_enable_vq(net, vq);
+		return;
+	}
+
+	vhost_hlen = nvq->vhost_hlen;
+	sock_hlen = nvq->sock_hlen;
+
+	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+	set_num_buffers = mergeable || vhost_has_feature(vq, VIRTIO_F_VERSION_1);
+
+	do {
+		u32 pkt_len;
+		int err;
+		s16 headcount;
+		struct kvec iov;
+
+		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+						      &busyloop_intr, &count);
+		if (!sock_len)
+			break;
+		sock_len += sock_hlen;
+		if (sock_len > VHOST_NET_FILTER_MAX_LEN) {
+			/* Consume and drop oversized packet. */
+			iov.iov_base = pkt;
+			iov.iov_len = 1;
+			kernel_recvmsg(sock, &msg, &iov, 1, 1,
+				       MSG_DONTWAIT | MSG_TRUNC);
+			continue;
+		}
+
+		vhost_len = sock_len + vhost_hlen;
+		headcount = get_rx_bufs(nvq, vq->heads + count,
+					vq->nheads + count, vhost_len, &in,
+					vq_log, &log,
+					likely(mergeable) ? UIO_MAXIOV : 1,
+					&ndesc);
+		if (unlikely(headcount < 0))
+			goto out;
+
+		if (!headcount) {
+			if (unlikely(busyloop_intr)) {
+				vhost_poll_queue(&vq->poll);
+			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+				vhost_disable_notify(&net->dev, vq);
+				continue;
+			}
+			goto out;
+		}
+
+		busyloop_intr = false;
+
+		if (nvq->rx_ring)
+			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
+
+		iov.iov_base = pkt;
+		iov.iov_len = sock_len;
+		err = kernel_recvmsg(sock, &msg, &iov, 1, sock_len,
+				     MSG_DONTWAIT | MSG_TRUNC);
+		if (unlikely(err != sock_len)) {
+			vhost_discard_vq_desc(vq, headcount, ndesc);
+			continue;
+		}
+
+		pkt_len = sock_len;
+		err = vhost_net_filter_request(net, VHOST_NET_FILTER_DIRECTION_RX,
+					       pkt, &pkt_len);
+		if (err < 0) /* fail-open: deliver unfiltered on send error */
+			pkt_len = sock_len;
+		if (pkt_len != sock_len) {
+			vhost_discard_vq_desc(vq, headcount, ndesc);
+			continue;
+		}
+
+		iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len);
+		fixup = msg.msg_iter;
+		if (unlikely(vhost_hlen))
+			iov_iter_advance(&msg.msg_iter, vhost_hlen);
+
+		if (copy_to_iter(pkt, sock_len, &msg.msg_iter) != sock_len) {
+			vhost_discard_vq_desc(vq, headcount, ndesc);
+			goto out;
+		}
+
+		if (unlikely(vhost_hlen)) {
+			if (copy_to_iter(&hdr, sizeof(hdr),
+					 &fixup) != sizeof(hdr)) {
+				vhost_discard_vq_desc(vq, headcount, ndesc);
+				goto out;
+			}
+		} else {
+			iov_iter_advance(&fixup, sizeof(hdr));
+		}
+
+		num_buffers = cpu_to_vhost16(vq, headcount);
+		if (likely(set_num_buffers) &&
+		    copy_to_iter(&num_buffers, sizeof(num_buffers), &fixup) !=
+			    sizeof(num_buffers)) {
+			vhost_discard_vq_desc(vq, headcount, ndesc);
+			goto out;
+		}
+
+		nvq->done_idx += headcount;
+		count += in_order ? 1 : headcount;
+		if (nvq->done_idx > VHOST_NET_BATCH) {
+			vhost_net_signal_used(nvq, count);
+			count = 0;
+		}
+
+		if (unlikely(vq_log))
+			vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in);
+
+		total_len += vhost_len;
+	} while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
+
+	if (unlikely(busyloop_intr))
+		vhost_poll_queue(&vq->poll);
+	else if (!sock_len)
+		vhost_net_enable_vq(net, vq);
+
+out:
+	vhost_net_signal_used(nvq, count);
+	kvfree(pkt);
+}
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
@@ -1281,6 +1504,11 @@ static void handle_rx(struct vhost_net *net)
 	set_num_buffers = mergeable ||
 			  vhost_has_feature(vq, VIRTIO_F_VERSION_1);
 
+	if (READ_ONCE(net->filter_sock)) {
+		handle_rx_filter(net, sock);
+		goto out_unlock;
+	}
+
 	do {
 		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
 						      &busyloop_intr, &count);
@@ -1383,6 +1611,7 @@ static void handle_rx(struct vhost_net *net)
 		vhost_net_enable_vq(net, vq);
 out:
 	vhost_net_signal_used(nvq, count);
+out_unlock:
 	mutex_unlock(&vq->mutex);
 }
 
-- 
2.52.0


  parent reply	other threads:[~2026-02-08 14:35 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-08 14:32 [RFC 0/3] vhost-net: netfilter support for RX path Cindy Lu
2026-02-08 14:32 ` [RFC 1/3] uapi: vhost: add vhost-net netfilter offload API Cindy Lu
2026-02-08 17:32   ` Michael S. Tsirkin
2026-02-09  6:41     ` Cindy Lu
2026-02-08 14:32 ` [RFC 2/3] vhost/net: add netfilter socket support Cindy Lu
2026-02-08 14:32 ` Cindy Lu [this message]
2026-02-09  1:46 ` [RFC 0/3] vhost-net: netfilter support for RX path Jason Wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260208143441.2177372-4-lulu@redhat.com \
    --to=lulu@redhat.com \
    --cc=jasowang@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=virtualization@lists.linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox