Linux virtualization list

Linux virtualization list
 help / color / mirror / Atom feed

* [PATCH net-next 04/11] tuntap: simplify error handling in tun_build_skb()
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

There's no need to duplicate page get logic in each action. So this
patch tries to get page and calculate the offset before processing XDP
actions, and undo them when meet errors (we don't care the performance
on errors). This will be used for factoring out XDP logic.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 372caf7d67d9..f8cdcfa392c3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1642,7 +1642,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 				     int len, int *skb_xdp)
 {
 	struct page_frag *alloc_frag = &current->task_frag;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	unsigned int delta = 0;
@@ -1668,6 +1668,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	if (copied != len)
 		return ERR_PTR(-EFAULT);
 
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
 	/* There's a small window that XDP may be set after the check
 	 * of xdp_prog above, this should be rare and for simplicity
 	 * we do XDP on skb in case the headroom is not enough.
@@ -1695,23 +1698,15 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		switch (act) {
 		case XDP_REDIRECT:
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
 			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 			xdp_do_flush_map();
 			if (err)
-				goto err_redirect;
-			rcu_read_unlock();
-			local_bh_enable();
-			return NULL;
+				goto err_xdp;
+			goto out;
 		case XDP_TX:
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
 			if (tun_xdp_tx(tun->dev, &xdp) < 0)
-				goto err_redirect;
-			rcu_read_unlock();
-			local_bh_enable();
-			return NULL;
+				goto err_xdp;
+			goto out;
 		case XDP_PASS:
 			delta = orig_data - xdp.data;
 			len = xdp.data_end - xdp.data;
@@ -1730,23 +1725,23 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	local_bh_enable();
 
 	skb = build_skb(buf, buflen);
-	if (!skb)
-		return ERR_PTR(-ENOMEM);
+	if (!skb) {
+		skb = ERR_PTR(-ENOMEM);
+		goto out;
+	}
 
 	skb_reserve(skb, pad - delta);
 	skb_put(skb, len);
-	get_page(alloc_frag->page);
-	alloc_frag->offset += buflen;
 
 	return skb;
 
-err_redirect:
-	put_page(alloc_frag->page);
 err_xdp:
+	alloc_frag->offset -= buflen;
+	put_page(alloc_frag->page);
+out:
 	rcu_read_unlock();
 	local_bh_enable();
-	this_cpu_inc(tun->pcpu_stats->rx_dropped);
-	return NULL;
+	return skb;
 }
 
 /* Get packet from user space buffer */
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 05/11] tuntap: tweak on the path of non-xdp case in tun_build_skb()
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

If we're sure not to go native XDP, there's no need for several things
like bh and rcu stuffs.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f8cdcfa392c3..389aa0727cc6 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1675,10 +1675,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	 * of xdp_prog above, this should be rare and for simplicity
 	 * we do XDP on skb in case the headroom is not enough.
 	 */
-	if (hdr->gso_type || !xdp_prog)
+	if (hdr->gso_type || !xdp_prog) {
 		*skb_xdp = 1;
-	else
-		*skb_xdp = 0;
+		goto build;
+	}
+
+	*skb_xdp = 0;
 
 	local_bh_disable();
 	rcu_read_lock();
@@ -1724,6 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	rcu_read_unlock();
 	local_bh_enable();
 
+build:
 	skb = build_skb(buf, buflen);
 	if (!skb) {
 		skb = ERR_PTR(-ENOMEM);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 06/11] tuntap: split out XDP logic
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch split out XDP logic into a single function. This make it to
be reused by XDP batching path in the following patch.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 84 ++++++++++++++++++++++++++++-------------------
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 389aa0727cc6..21b125020b3b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1635,6 +1635,44 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 	return true;
 }
 
+static u32 tun_do_xdp(struct tun_struct *tun,
+		      struct tun_file *tfile,
+		      struct bpf_prog *xdp_prog,
+		      struct xdp_buff *xdp,
+		      int *err)
+{
+	u32 act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+	switch (act) {
+	case XDP_REDIRECT:
+		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+		xdp_do_flush_map();
+		if (*err)
+			break;
+		goto out;
+	case XDP_TX:
+		*err = tun_xdp_tx(tun->dev, xdp);
+		if (*err < 0)
+			break;
+		*err = 0;
+		goto out;
+	case XDP_PASS:
+		goto out;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		/* fall through */
+	case XDP_ABORTED:
+		trace_xdp_exception(tun->dev, xdp_prog, act);
+		/* fall through */
+	case XDP_DROP:
+		break;
+	}
+
+	put_page(virt_to_head_page(xdp->data_hard_start));
+out:
+	return act;
+}
+
 static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 				     struct tun_file *tfile,
 				     struct iov_iter *from,
@@ -1645,10 +1683,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	struct sk_buff *skb = NULL;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	unsigned int delta = 0;
 	char *buf;
 	size_t copied;
-	int err, pad = TUN_RX_PAD;
+	int pad = TUN_RX_PAD;
+	int err = 0;
 
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(tun->xdp_prog);
@@ -1685,9 +1723,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	local_bh_disable();
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(tun->xdp_prog);
-	if (xdp_prog && !*skb_xdp) {
+	if (xdp_prog) {
 		struct xdp_buff xdp;
-		void *orig_data;
 		u32 act;
 
 		xdp.data_hard_start = buf;
@@ -1695,33 +1732,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		xdp.rxq = &tfile->xdp_rxq;
-		orig_data = xdp.data;
-		act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
-		switch (act) {
-		case XDP_REDIRECT:
-			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
-			xdp_do_flush_map();
-			if (err)
-				goto err_xdp;
-			goto out;
-		case XDP_TX:
-			if (tun_xdp_tx(tun->dev, &xdp) < 0)
-				goto err_xdp;
-			goto out;
-		case XDP_PASS:
-			delta = orig_data - xdp.data;
-			len = xdp.data_end - xdp.data;
-			break;
-		default:
-			bpf_warn_invalid_xdp_action(act);
-			/* fall through */
-		case XDP_ABORTED:
-			trace_xdp_exception(tun->dev, xdp_prog, act);
-			/* fall through */
-		case XDP_DROP:
+		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
+		if (err)
 			goto err_xdp;
-		}
+		if (act != XDP_PASS)
+			goto out;
+
+		pad = xdp.data - xdp.data_hard_start;
+		len = xdp.data_end - xdp.data;
 	}
 	rcu_read_unlock();
 	local_bh_enable();
@@ -1729,18 +1747,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 build:
 	skb = build_skb(buf, buflen);
 	if (!skb) {
+		put_page(alloc_frag->page);
 		skb = ERR_PTR(-ENOMEM);
 		goto out;
 	}
 
-	skb_reserve(skb, pad - delta);
+	skb_reserve(skb, pad);
 	skb_put(skb, len);
 
 	return skb;
 
 err_xdp:
-	alloc_frag->offset -= buflen;
-	put_page(alloc_frag->page);
+	this_cpu_inc(tun->pcpu_stats->rx_dropped);
 out:
 	rcu_read_unlock();
 	local_bh_enable();
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 07/11] tuntap: move XDP flushing out of tun_do_xdp()
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This will allow adding batch flushing on top.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 21b125020b3b..ff1cbf3ebd50 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1646,7 +1646,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
 	switch (act) {
 	case XDP_REDIRECT:
 		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
-		xdp_do_flush_map();
 		if (*err)
 			break;
 		goto out;
@@ -1735,6 +1734,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
 		if (err)
 			goto err_xdp;
+
+		if (act == XDP_REDIRECT)
+			xdp_do_flush_map();
 		if (act != XDP_PASS)
 			goto out;
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 08/11] tun: switch to new type of msg_control
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch introduces to a new tun/tap specific msg_control:

#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR  2
struct tun_msg_ctl {
       int type;
       void *ptr;
};

This allows us to pass different kinds of msg_control through
sendmsg(). The first supported type is ubuf (TUN_MSG_UBUF) which will
be used by the existed vhost_net zerocopy code. The second is XDP
buff, which allows vhost_net to pass XDP buff to TUN. This could be
used to implement accepting an array of XDP buffs from vhost_net in
the following patches.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tap.c      | 18 ++++++++++++------
 drivers/net/tun.c      |  6 +++++-
 drivers/vhost/net.c    |  7 +++++--
 include/linux/if_tun.h |  7 +++++++
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index f0f7cd977667..7996ed7cbf18 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
 #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
 
 /* Get packet from user space buffer */
-static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
+static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
 			    struct iov_iter *from, int noblock)
 {
 	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
@@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
-	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
+	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
 		struct iov_iter i;
 
 		copylen = vnet_hdr.hdr_len ?
@@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 	tap = rcu_dereference(q->tap);
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
-		skb_shinfo(skb)->destructor_arg = m->msg_control;
+		skb_shinfo(skb)->destructor_arg = msg_control;
 		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-	} else if (m && m->msg_control) {
-		struct ubuf_info *uarg = m->msg_control;
+	} else if (msg_control) {
+		struct ubuf_info *uarg = msg_control;
 		uarg->callback(uarg, false);
 	}
 
@@ -1150,7 +1150,13 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m,
 		       size_t total_len)
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
-	return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
+	struct tun_msg_ctl *ctl = m->msg_control;
+
+	if (ctl && ctl->type != TUN_MSG_UBUF)
+		return -EINVAL;
+
+	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
+			    m->msg_flags & MSG_DONTWAIT);
 }
 
 static int tap_recvmsg(struct socket *sock, struct msghdr *m,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ff1cbf3ebd50..c839a4bdcbd9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2429,11 +2429,15 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	int ret;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
+	struct tun_msg_ctl *ctl = m->msg_control;
 
 	if (!tun)
 		return -EBADFD;
 
-	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+	if (ctl && ctl->type != TUN_MSG_UBUF)
+		return -EINVAL;
+
+	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
 	tun_put(tun);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4e656f89cb22..fb01ce6d981c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -620,6 +620,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		.msg_controllen = 0,
 		.msg_flags = MSG_DONTWAIT,
 	};
+	struct tun_msg_ctl ctl;
 	size_t len, total_len = 0;
 	int err;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -664,8 +665,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
 			refcount_set(&ubuf->refcnt, 1);
-			msg.msg_control = ubuf;
-			msg.msg_controllen = sizeof(ubuf);
+			msg.msg_control = &ctl;
+			ctl.type = TUN_MSG_UBUF;
+			ctl.ptr = ubuf;
+			msg.msg_controllen = sizeof(ctl);
 			ubufs = nvq->ubufs;
 			atomic_inc(&ubufs->refcount);
 			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3d2996dc7d85..ba46dced1f38 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,13 @@
 
 #define TUN_XDP_FLAG 0x1UL
 
+#define TUN_MSG_UBUF 1
+#define TUN_MSG_PTR  2
+struct tun_msg_ctl {
+	int type;
+	void *ptr;
+};
+
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 09/11] tuntap: accept an array of XDP buffs through sendmsg()
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch implement TUN_MSG_PTR msg_control type. This type allows
the caller to pass an array of XDP buffs to tuntap through ptr field
of the tun_msg_control. If an XDP program is attached, tuntap can run
XDP program directly. If not, tuntap will build skb and do a fast
receiving since part of the work has been done by vhost_net.

This will avoid lots of indirect calls thus improves the icache
utilization and allows to do XDP batched flushing when doing XDP
redirection.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c839a4bdcbd9..069db2e5dd08 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2424,22 +2424,119 @@ static void tun_sock_write_space(struct sock *sk)
 	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
 }
 
+static int tun_xdp_one(struct tun_struct *tun,
+		       struct tun_file *tfile,
+		       struct xdp_buff *xdp, int *flush)
+{
+	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+	struct tun_pcpu_stats *stats;
+	struct bpf_prog *xdp_prog;
+	struct sk_buff *skb = NULL;
+	u32 rxhash = 0, act;
+	int buflen = *(int *)xdp->data_hard_start;
+	int err = 0;
+	bool skb_xdp = false;
+
+	xdp_prog = rcu_dereference(tun->xdp_prog);
+	if (xdp_prog) {
+		if (gso->gso_type) {
+			skb_xdp = true;
+			goto build;
+		}
+		xdp_set_data_meta_invalid(xdp);
+		xdp->rxq = &tfile->xdp_rxq;
+		act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
+		if (err)
+			goto out;
+		if (act == XDP_REDIRECT)
+			*flush = true;
+		if (act != XDP_PASS)
+			goto out;
+	}
+
+build:
+	skb = build_skb(xdp->data_hard_start, buflen);
+	if (!skb) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	skb_put(skb, xdp->data_end - xdp->data);
+
+	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+		kfree_skb(skb);
+		err = -EINVAL;
+		goto out;
+	}
+
+	skb->protocol = eth_type_trans(skb, tun->dev);
+	skb_reset_network_header(skb);
+	skb_probe_transport_header(skb, 0);
+
+	if (skb_xdp) {
+		err = do_xdp_generic(xdp_prog, skb);
+		if (err != XDP_PASS)
+			goto out;
+	}
+
+	if (!rcu_dereference(tun->steering_prog))
+		rxhash = __skb_get_hash_symmetric(skb);
+
+	netif_receive_skb(skb);
+
+	stats = get_cpu_ptr(tun->pcpu_stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+	put_cpu_ptr(stats);
+
+	if (rxhash)
+		tun_flow_update(tun, rxhash, tfile);
+
+out:
+	return err;
+}
+
 static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
-	int ret;
+	int ret, i;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
 	struct tun_msg_ctl *ctl = m->msg_control;
+	struct xdp_buff *xdp;
 
 	if (!tun)
 		return -EBADFD;
 
-	if (ctl && ctl->type != TUN_MSG_UBUF)
-		return -EINVAL;
+	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+		int n = ctl->type >> 16;
+		int flush = 0;
+
+		local_bh_disable();
+		rcu_read_lock();
+
+		for (i = 0; i < n; i++) {
+			xdp = &((struct xdp_buff *)ctl->ptr)[i];
+			tun_xdp_one(tun, tfile, xdp, &flush);
+		}
+
+		if (flush)
+			xdp_do_flush_map();
+
+		rcu_read_unlock();
+		local_bh_enable();
+
+		ret = total_len;
+		goto out;
+	}
 
 	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
+out:
 	tun_put(tun);
 	return ret;
 }
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 10/11] tap: accept an array of XDP buffs through sendmsg()
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch implement TUN_MSG_PTR msg_control type. This type allows
the caller to pass an array of XDP buffs to tuntap through ptr field
of the tun_msg_control. Tap will build skb through those XDP buffers.

This will avoid lots of indirect calls thus improves the icache
utilization and allows to do XDP batched flushing when doing XDP
redirection.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 7996ed7cbf18..50eb7bf22225 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1146,14 +1146,83 @@ static const struct file_operations tap_fops = {
 #endif
 };
 
+static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
+{
+	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+	int buflen = *(int *)xdp->data_hard_start;
+	int vnet_hdr_len = 0;
+	struct tap_dev *tap;
+	struct sk_buff *skb;
+	int err, depth;
+
+	if (q->flags & IFF_VNET_HDR)
+		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
+
+	skb = build_skb(xdp->data_hard_start, buflen);
+	if (!skb) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	skb_put(skb, xdp->data_end - xdp->data);
+
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	if (vnet_hdr_len) {
+		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
+		if (err)
+			goto err_kfree;
+	}
+
+	skb_probe_transport_header(skb, ETH_HLEN);
+
+	/* Move network header to the right position for VLAN tagged packets */
+	if ((skb->protocol == htons(ETH_P_8021Q) ||
+	     skb->protocol == htons(ETH_P_8021AD)) &&
+	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
+		skb_set_network_header(skb, depth);
+
+	rcu_read_lock();
+	tap = rcu_dereference(q->tap);
+	if (tap) {
+		skb->dev = tap->dev;
+		dev_queue_xmit(skb);
+	} else {
+		kfree_skb(skb);
+	}
+	rcu_read_unlock();
+
+	return 0;
+
+err_kfree:
+	kfree_skb(skb);
+err:
+	rcu_read_lock();
+		tap = rcu_dereference(q->tap);
+	if (tap && tap->count_tx_dropped)
+		tap->count_tx_dropped(tap);
+	rcu_read_unlock();
+	return err;
+}
+
 static int tap_sendmsg(struct socket *sock, struct msghdr *m,
 		       size_t total_len)
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
 	struct tun_msg_ctl *ctl = m->msg_control;
+	struct xdp_buff *xdp;
+	int i;
 
-	if (ctl && ctl->type != TUN_MSG_UBUF)
-		return -EINVAL;
+	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+		for (i = 0; i < ctl->type >> 16; i++) {
+			xdp = &((struct xdp_buff *)ctl->ptr)[i];
+			tap_get_user_xdp(q, xdp);
+		}
+		return 0;
+	}
 
 	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			    m->msg_flags & MSG_DONTWAIT);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 11/11] vhost_net: batch submitting XDP buffers to underlayer sockets
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, kvm, virtualization
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch implements XDP batching for vhost_net. The idea is first to
try to do userspace copy and build XDP buff directly in vhost. Instead
of submitting the packet immediately, vhost_net will batch them in an
array and submit every 64 (VHOST_NET_BATCH) packets to the under layer
sockets through msg_control of sendmsg().

When XDP is enabled on the TUN/TAP, TUN/TAP can process XDP inside a
loop without caring GUP thus it can do batch map flushing. When XDP is
not enabled or not supported, the underlayer socket need to build skb
and pass it to network core. The batched packet submission allows us
to do batching like netif_receive_skb_list() in the future.

This saves lots of indirect calls for better cache utilization. For
the case that we can't so batching e.g when sndbuf is limited or
packet size is too large, we will go for usual one packet per
sendmsg() way.

Doing testpmd on various setups gives us:

Test                /+pps%
XDP_DROP on TAP     /+44.8%
XDP_REDIRECT on TAP /+29%
macvtap (skb)       /+26%

Netperf tests shows obvious improvements for small packet transmission:

size/session/+thu%/+normalize%
   64/     1/   +2%/    0%
   64/     2/   +3%/   +1%
   64/     4/   +7%/   +5%
   64/     8/   +8%/   +6%
  256/     1/   +3%/    0%
  256/     2/  +10%/   +7%
  256/     4/  +26%/  +22%
  256/     8/  +27%/  +23%
  512/     1/   +3%/   +2%
  512/     2/  +19%/  +14%
  512/     4/  +43%/  +40%
  512/     8/  +45%/  +41%
 1024/     1/   +4%/    0%
 1024/     2/  +27%/  +21%
 1024/     4/  +38%/  +73%
 1024/     8/  +15%/  +24%
 2048/     1/  +10%/   +7%
 2048/     2/  +16%/  +12%
 2048/     4/    0%/   +2%
 2048/     8/    0%/   +2%
 4096/     1/  +36%/  +60%
 4096/     2/  -11%/  -26%
 4096/     4/    0%/  +14%
 4096/     8/    0%/   +4%
16384/     1/   -1%/   +5%
16384/     2/    0%/   +2%
16384/     4/    0%/   -3%
16384/     8/    0%/   +4%
65535/     1/    0%/  +10%
65535/     2/    0%/   +8%
65535/     4/    0%/   +1%
65535/     8/    0%/   +3%

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 164 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 151 insertions(+), 13 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index fb01ce6d981c..1dd4239cbff8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -116,6 +116,7 @@ struct vhost_net_virtqueue {
 	 * For RX, number of batched heads
 	 */
 	int done_idx;
+	int batched_xdp;
 	/* an array of userspace buffers info */
 	struct ubuf_info *ubuf_info;
 	/* Reference counting for outstanding ubufs.
@@ -123,6 +124,7 @@ struct vhost_net_virtqueue {
 	struct vhost_net_ubuf_ref *ubufs;
 	struct ptr_ring *rx_ring;
 	struct vhost_net_buf rxq;
+	struct xdp_buff xdp[VHOST_NET_BATCH];
 };
 
 struct vhost_net {
@@ -338,6 +340,11 @@ static bool vhost_sock_zcopy(struct socket *sock)
 		sock_flag(sock->sk, SOCK_ZEROCOPY);
 }
 
+static bool vhost_sock_xdp(struct socket *sock)
+{
+	return sock_flag(sock->sk, SOCK_XDP);
+}
+
 /* In case of DMA done not in order in lower device driver for some reason.
  * upend_idx is used to track end of used idx, done_idx is used to track head
  * of used idx. Once lower device DMA done contiguously, we will signal KVM
@@ -444,10 +451,36 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
 	nvq->done_idx = 0;
 }
 
+static void vhost_tx_batch(struct vhost_net *net,
+			   struct vhost_net_virtqueue *nvq,
+			   struct socket *sock,
+			   struct msghdr *msghdr)
+{
+	struct tun_msg_ctl ctl = {
+		.type = nvq->batched_xdp << 16 | TUN_MSG_PTR,
+		.ptr = nvq->xdp,
+	};
+	int err;
+
+	if (nvq->batched_xdp == 0)
+		goto signal_used;
+
+	msghdr->msg_control = &ctl;
+	err = sock->ops->sendmsg(sock, msghdr, 0);
+	if (unlikely(err < 0)) {
+		vq_err(&nvq->vq, "Fail to batch sending packets\n");
+		return;
+	}
+
+signal_used:
+	vhost_net_signal_used(nvq);
+	nvq->batched_xdp = 0;
+}
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_net_virtqueue *nvq,
 				    unsigned int *out_num, unsigned int *in_num,
-				    bool *busyloop_intr)
+				    struct msghdr *msghdr, bool *busyloop_intr)
 {
 	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned long uninitialized_var(endtime);
@@ -455,8 +488,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				  out_num, in_num, NULL, NULL);
 
 	if (r == vq->num && vq->busyloop_timeout) {
+		/* Flush batched packets first */
 		if (!vhost_sock_zcopy(vq->private_data))
-			vhost_net_signal_used(nvq);
+			vhost_tx_batch(net, nvq, vq->private_data, msghdr);
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(endtime)) {
@@ -512,7 +546,7 @@ static int get_tx_bufs(struct vhost_net *net,
 	struct vhost_virtqueue *vq = &nvq->vq;
 	int ret;
 
-	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
 
 	if (ret < 0 || ret == vq->num)
 		return ret;
@@ -540,6 +574,83 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
 	       !vhost_vq_avail_empty(vq->dev, vq);
 }
 
+#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
+static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
+			       struct iov_iter *from)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct socket *sock = vq->private_data;
+	struct page_frag *alloc_frag = &current->task_frag;
+	struct virtio_net_hdr *gso;
+	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
+	size_t len = iov_iter_count(from);
+	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
+	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
+	int sock_hlen = nvq->sock_hlen;
+	void *buf;
+	int copied;
+
+	if (unlikely(len < nvq->sock_hlen))
+		return -EFAULT;
+
+	if (SKB_DATA_ALIGN(len + pad) +
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+		return -ENOSPC;
+
+	buflen += SKB_DATA_ALIGN(len + pad);
+	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+		return -ENOMEM;
+
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+
+	/* We store two kinds of metadata in the header which will be
+	 * used for XDP_PASS to do build_skb():
+	 * offset 0: buflen
+	 * offset sizeof(int): vnet header
+	 */
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + sizeof(int),
+				     sock_hlen, from);
+	if (copied != sock_hlen)
+		return -EFAULT;
+
+	gso = (struct virtio_net_hdr *)(buf + sizeof(int));
+
+	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+	    vhost16_to_cpu(vq, gso->csum_start) +
+	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
+	    vhost16_to_cpu(vq, gso->hdr_len)) {
+		gso->hdr_len = cpu_to_vhost16(vq,
+			       vhost16_to_cpu(vq, gso->csum_start) +
+			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
+
+		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
+			return -EINVAL;
+	}
+
+	len -= sock_hlen;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + pad,
+				     len, from);
+	if (copied != len)
+		return -EFAULT;
+
+	xdp->data_hard_start = buf;
+	xdp->data = buf + pad;
+	xdp->data_end = xdp->data + len;
+	*(int *)(xdp->data_hard_start) = buflen;
+
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
+	++nvq->batched_xdp;
+
+	return 0;
+}
+
 static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -556,10 +667,14 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 	size_t len, total_len = 0;
 	int err;
 	int sent_pkts = 0;
+	bool bulking = (sock->sk->sk_sndbuf == INT_MAX);
 
 	for (;;) {
 		bool busyloop_intr = false;
 
+		if (nvq->done_idx == VHOST_NET_BATCH)
+			vhost_tx_batch(net, nvq, sock, &msg);
+
 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
 				   &busyloop_intr);
 		/* On error, stop handling until the next kick. */
@@ -577,14 +692,34 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			break;
 		}
 
-		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
-		vq->heads[nvq->done_idx].len = 0;
-
 		total_len += len;
-		if (tx_can_batch(vq, total_len))
-			msg.msg_flags |= MSG_MORE;
-		else
-			msg.msg_flags &= ~MSG_MORE;
+
+		/* For simplicity, TX batching is only enabled if
+		 * sndbuf is unlimited.
+		 */
+		if (bulking) {
+			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
+			if (!err) {
+				goto done;
+			} else if (unlikely(err != -ENOSPC)) {
+				vhost_tx_batch(net, nvq, sock, &msg);
+				vhost_discard_vq_desc(vq, 1);
+				vhost_net_enable_vq(net, vq);
+				break;
+			}
+
+			/* We can't build XDP buff, go for single
+			 * packet path but let's flush batched
+			 * packets.
+			 */
+			vhost_tx_batch(net, nvq, sock, &msg);
+			msg.msg_control = NULL;
+		} else {
+			if (tx_can_batch(vq, total_len))
+				msg.msg_flags |= MSG_MORE;
+			else
+				msg.msg_flags &= ~MSG_MORE;
+		}
 
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(sock, &msg, len);
@@ -596,15 +731,17 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 		if (err != len)
 			pr_debug("Truncated TX packet: len %d != %zd\n",
 				 err, len);
-		if (++nvq->done_idx >= VHOST_NET_BATCH)
-			vhost_net_signal_used(nvq);
+done:
+		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+		vq->heads[nvq->done_idx].len = 0;
+		++nvq->done_idx;
 		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
 	}
 
-	vhost_net_signal_used(nvq);
+	vhost_tx_batch(net, nvq, sock, &msg);
 }
 
 static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
@@ -1111,6 +1248,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		n->vqs[i].ubuf_info = NULL;
 		n->vqs[i].upend_idx = 0;
 		n->vqs[i].done_idx = 0;
+		n->vqs[i].batched_xdp = 0;
 		n->vqs[i].vhost_hlen = 0;
 		n->vqs[i].sock_hlen = 0;
 		n->vqs[i].rx_ring = NULL;
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH 03/11] UAPI: virtio_net: Fix use of C++ keywords as structural members
From: David Howells @ 2018-09-06  7:09 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kbuild, linux-api, linux-kernel, virtualization, dhowells
In-Reply-To: <20180905125636-mutt-send-email-mst@kernel.org>

Michael S. Tsirkin <mst@redhat.com> wrote:

> As long as you do not intend to use any classes, how about
> simply adding
> 
> -Dclass=_class
> 
> to your command line?

That kind of misses the point;-).  It's not reasonable to expect all userspace
C++ users to do this.

David

^ permalink raw reply

* Re: [PATCH] bochs: convert to drm_fb_helper_fbdev_setup/teardown
From: Gerd Hoffmann @ 2018-09-06  7:23 UTC (permalink / raw)
  To: Peter Wu; +Cc: David Airlie, linux-kernel, dri-devel, virtualization
In-Reply-To: <20180905150630.GA25979@al>

  Hi,

> > You can probably get rid of this one if you're refactoring even more. The
> > generic fb_probe implementation (already merged) plus gem-shmem support
> > for it (still in flight) from Noralf should be able to pull that off. That
> > gives you the fb_mmap implementation, but with 100% generic code instead
> > of a driver specific hack like Max did.
> 
> Aside from the warning, I have not observed actual issues. This patch
> was prepared on top of v4.18.1 but the new drm_fb_helper_generic_probe
> helper is in master (future 4.19). I suppose that it can be done as a
> future cleanup. Nice work Noralf on reducing duplication!

FYI: qemu kms driver patches go through drm-misc-next, so you can also
work against that branch.

> > I'll leave merging to Gerd.
> 
> Thanks, I somehow missed a patch. This one does not compile due to
> "fb.initialized" still being used in bochs_drv.c. Removal is trivial,
> I'll wait for some more feedback and then send a v2 with another patch
> prepended.

Patch looks good to me (except for the build failure which you've
noticed already).

cheers,
  Gerd

^ permalink raw reply

* [RFC] UAPI: Check headers by compiling all together as C++
From: David Howells @ 2018-09-06  9:18 UTC (permalink / raw)
  To: linux-api, linux-kbuild
  Cc: Michael S. Tsirkin, David Airlie, Mat Martineau, dri-devel,
	virtualization, dhowells, Masahiro Yamada, keyrings,
	Ryusuke Konishi, Yann Droneaud, linux-nilfs, linux-nvdimm,
	codalist, coda, coreteam, Kent Overstreet, linux-arm-msm, Coly Li,
	linux-bcache, Dan Williams, Jan Harkes, Michal Marek,
	linux-kernel, Rob Clark, netfilter-devel, linux-fsdevel


Here's a set of patches that inserts a step into the build process to make
sure that the UAPI headers can all be built together with C++ (if the
compiler being used supports C++).

Note that it's based on a commit from the sound tree to fix usage of u32
and co..

Most of the patches perform fixups, including:

 (1) Fix member names that conflict with C++ reserved words by providing
     alternates that can be used anywhere.  An anonymous union is used so
     that that the conflicting name is still available outside of C++.

 (2) Fix the use of flexible arrays in structs that get embedded (which is
     illegal in C++).

 (3) Remove the use of internal kernel structs in UAPI structures.

 (4) Fix symbol collisions.

 (5) Fix use of sparsely initialised arrays (which g++ doesn't implement).

 (6) Remove some use of PAGE_SIZE since this isn't valid outside of the
     kernel.

There's also:

 (7) Move the coda_psdev.h header file to fs/coda/.

And lastly:

 (8) Compile all of the UAPI headers (with a few exceptions) together as
     C++ to catch new errors occurring as part of the regular build
     process.

Changes for v2:

 - Merge commit from sound tree to fix u32 usage issues
 - Use a switch to fix sparse array initialisation
 - Simplify nilfs2 by performing bitwise ops in LE space not CPU space
 - Handle conflicting fix to use of 'private' in keyctl.h
 - Move kernel internal coda bits to coda internal headers
 - Move coda_psdev.h header to fs/coda/.

The patches can also be found here:

	http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=uapi-check

Thanks,
David
---
David Howells (11):
      UAPI: drm: Fix use of C++ keywords as structural members
      UAPI: keys: Fix use of C++ keywords as structural members
      UAPI: virtio_net: Fix use of C++ keywords as structural members
      UAPI: bcache: Fix use of embedded flexible array
      UAPI: coda: Move kernel internals out of public view
      coda: Move internal defs out of include/linux/
      UAPI: netfilter: Fix symbol collision issues
      UAPI: nilfs2: Fix use of undefined byteswapping functions
      UAPI: ndctl: Fix g++-unsupported initialisation in headers
      UAPI: ndctl: Remove use of PAGE_SIZE
      UAPI: Check headers build for C++


 Makefile                                          |    1 
 fs/coda/cache.c                                   |    2 
 fs/coda/cnode.c                                   |    2 
 fs/coda/coda_linux.c                              |    2 
 fs/coda/coda_psdev.h                              |   88 +++++++++++++++
 fs/coda/dir.c                                     |    2 
 fs/coda/file.c                                    |    3 -
 fs/coda/inode.c                                   |    2 
 fs/coda/pioctl.c                                  |    3 -
 fs/coda/psdev.c                                   |    3 -
 fs/coda/symlink.c                                 |    3 -
 fs/coda/upcall.c                                  |    2 
 include/linux/coda_psdev.h                        |   72 ------------
 include/linux/ndctl.h                             |   22 ++++
 include/uapi/drm/i810_drm.h                       |    7 +
 include/uapi/drm/msm_drm.h                        |    7 +
 include/uapi/linux/bcache.h                       |    2 
 include/uapi/linux/coda_psdev.h                   |   18 ---
 include/uapi/linux/keyctl.h                       |    7 +
 include/uapi/linux/ndctl.h                        |   52 ++++-----
 include/uapi/linux/netfilter/nfnetlink_cthelper.h |    2 
 include/uapi/linux/netfilter_ipv4/ipt_ECN.h       |    9 --
 include/uapi/linux/nilfs2_ondisk.h                |   28 ++---
 include/uapi/linux/virtio_net.h                   |    7 +
 scripts/headers-c++.sh                            |  124 +++++++++++++++++++++
 25 files changed, 304 insertions(+), 166 deletions(-)
 create mode 100644 fs/coda/coda_psdev.h
 delete mode 100644 include/linux/coda_psdev.h
 create mode 100644 include/linux/ndctl.h
 create mode 100755 scripts/headers-c++.sh

^ permalink raw reply

* [PATCH 03/11] UAPI: virtio_net: Fix use of C++ keywords as structural members [ver #2]
From: David Howells @ 2018-09-06  9:18 UTC (permalink / raw)
  To: linux-api, linux-kbuild
  Cc: dhowells, virtualization, linux-kernel, Michael S. Tsirkin
In-Reply-To: <153622549721.14298.8116794954073122489.stgit@warthog.procyon.org.uk>

The virtio_net_ctrl_hdr struct uses a C++ keyword as structural members.  Fix
this by inserting an anonymous union that provides an alternative name and
then hide the reserved name in C++.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "Michael S. Tsirkin" <mst@redhat.com>
cc: Jason Wang <jasowang@redhat.com>
cc: virtualization@lists.linux-foundation.org
---

 include/uapi/linux/virtio_net.h |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index a3715a3224c1..967142bc0e05 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -150,7 +150,12 @@ struct virtio_net_hdr_mrg_rxbuf {
  * command goes in between.
  */
 struct virtio_net_ctrl_hdr {
-	__u8 class;
+	union {
+#ifndef __cplusplus
+		__u8 class;
+#endif
+		__u8 _class;
+	};
 	__u8 cmd;
 } __attribute__((packed));

^ permalink raw reply related

* Re: [PATCH v36 0/5] Virtio-balloon: support free page reporting
From: Wei Wang @ 2018-09-06 12:18 UTC (permalink / raw)
  To: Dr. David Alan Gilbert, Michael S. Tsirkin
  Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
	liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
	akpm, virtualization, torvalds
In-Reply-To: <20180723143604.GB2457@work-vm>

On 07/23/2018 10:36 PM, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (mst@redhat.com) wrote:
>> On Fri, Jul 20, 2018 at 04:33:00PM +0800, Wei Wang wrote:
>>> This patch series is separated from the previous "Virtio-balloon
>>> Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>>> implemented by this series enables the virtio-balloon driver to report
>>> hints of guest free pages to the host. It can be used to accelerate live
>>> migration of VMs. Here is an introduction of this usage:
>>>
>>> Live migration needs to transfer the VM's memory from the source machine
>>> to the destination round by round. For the 1st round, all the VM's memory
>>> is transferred. From the 2nd round, only the pieces of memory that were
>>> written by the guest (after the 1st round) are transferred. One method
>>> that is popularly used by the hypervisor to track which part of memory is
>>> written is to write-protect all the guest memory.
>>>
>>> This feature enables the optimization by skipping the transfer of guest
>>> free pages during VM live migration. It is not concerned that the memory
>>> pages are used after they are given to the hypervisor as a hint of the
>>> free pages, because they will be tracked by the hypervisor and transferred
>>> in the subsequent round if they are used and written.
>>>
>>> * Tests
>>> - Test Environment
>>>      Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
>>>      Guest: 8G RAM, 4 vCPU
>>>      Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
>>>
>>> - Test Results
>>>      - Idle Guest Live Migration Time (results are averaged over 10 runs):
>>>          - Optimization v.s. Legacy = 409ms vs 1757ms --> ~77% reduction
>>> 	(setting page poisoning zero and enabling ksm don't affect the
>>>           comparison result)
>>>      - Guest with Linux Compilation Workload (make bzImage -j4):
>>>          - Live Migration Time (average)
>>>            Optimization v.s. Legacy = 1407ms v.s. 2528ms --> ~44% reduction
>>>          - Linux Compilation Time
>>>            Optimization v.s. Legacy = 5min4s v.s. 5min12s
>>>            --> no obvious difference
>> I'd like to see dgilbert's take on whether this kind of gain
>> justifies adding a PV interfaces, and what kind of guest workload
>> is appropriate.
>>
>> Cc'd.
> Well, 44% is great ... although the measurement is a bit weird.
>
> a) A 2 second downtime is very large; 300-500ms is more normal
> b) I'm not sure what the 'average' is  - is that just between a bunch of
> repeated migrations?
> c) What load was running in the guest during the live migration?
>
> An interesting measurement to add would be to do the same test but
> with a VM with a lot more RAM but the same load;  you'd hope the gain
> would be even better.
> It would be interesting, especially because the users who are interested
> are people creating VMs allocated with lots of extra memory (for the
> worst case) but most of the time migrating when it's fairly idle.
>
> Dave
>

Hi Dave,

The results of the added experiments have been shown in the v37 cover 
letter.
Could you have a look at https://lkml.org/lkml/2018/8/27/29 . Thanks.

Best,
Wei

^ permalink raw reply

* Re: [PATCH 03/11] UAPI: virtio_net: Fix use of C++ keywords as structural members
From: Michael S. Tsirkin @ 2018-09-06 14:36 UTC (permalink / raw)
  To: David Howells; +Cc: linux-api, virtualization, linux-kernel, linux-kbuild
In-Reply-To: <9357.1536217759@warthog.procyon.org.uk>

On Thu, Sep 06, 2018 at 08:09:19AM +0100, David Howells wrote:
> Michael S. Tsirkin <mst@redhat.com> wrote:
> 
> > As long as you do not intend to use any classes, how about
> > simply adding
> > 
> > -Dclass=_class
> > 
> > to your command line?
> 
> That kind of misses the point;-).  It's not reasonable to expect all userspace
> C++ users to do this.
> 
> David

I thought one of the points was that building kernel with c++ catches
some bugs, no?  If the point is to make life easier for c++ userspace
I'm not sure what we can do to be frank. C++ seems to be adding new
keywords with no restraint (C99 did it with inline and restrict too, but
it seems this stopped) so no good way to future-proof code for all
language dialects.

So I'd like to know which are the actual c++ users asking for this - we
can then accomodate the specific version they need.
Meanwhile people can get by with a wrapper along the lines of
#define class _class
#include <linux/virtio_net.h>
#undef class

-- 
MST

^ permalink raw reply

* Re: [PATCH 03/11] UAPI: virtio_net: Fix use of C++ keywords as structural members [ver #2]
From: Michael S. Tsirkin @ 2018-09-06 15:02 UTC (permalink / raw)
  To: David Howells; +Cc: linux-api, virtualization, linux-kernel, linux-kbuild
In-Reply-To: <153622552270.14298.14568295161017777604.stgit@warthog.procyon.org.uk>

On Thu, Sep 06, 2018 at 10:18:42AM +0100, David Howells wrote:
> The virtio_net_ctrl_hdr struct uses a C++ keyword as structural members.  Fix
> this by inserting an anonymous union that provides an alternative name and
> then hide the reserved name in C++.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: "Michael S. Tsirkin" <mst@redhat.com>
> cc: Jason Wang <jasowang@redhat.com>
> cc: virtualization@lists.linux-foundation.org
> ---
> 
>  include/uapi/linux/virtio_net.h |    7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
> index a3715a3224c1..967142bc0e05 100644
> --- a/include/uapi/linux/virtio_net.h
> +++ b/include/uapi/linux/virtio_net.h
> @@ -150,7 +150,12 @@ struct virtio_net_hdr_mrg_rxbuf {
>   * command goes in between.
>   */
>  struct virtio_net_ctrl_hdr {
> -	__u8 class;
> +	union {
> +#ifndef __cplusplus
> +		__u8 class;
> +#endif
> +		__u8 _class;
> +	};
>  	__u8 cmd;
>  } __attribute__((packed));
>  

So if we are going to do this, I think I'd prefer something like:

struct virtio_net_ctrl_hdr_v2 {
	__u8 cmd_class;
  	__u8 cmd;
};

And then hide the whole old structure. This also gets rid of the packed
keyword which we don't really need here.

Only issue is virtio_net_ctrl_hdr_v2 is ugly. But oh well.

And then rework at least QEMU to use the v2 of the header.

Quite a bit of churn, so I don't think it makes sense to apply just this
one in isolation - only if rest of the changes go in.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next 11/11] vhost_net: batch submitting XDP buffers to underlayer sockets
From: Michael S. Tsirkin @ 2018-09-06 16:46 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-12-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:26PM +0800, Jason Wang wrote:
> This patch implements XDP batching for vhost_net. The idea is first to
> try to do userspace copy and build XDP buff directly in vhost. Instead
> of submitting the packet immediately, vhost_net will batch them in an
> array and submit every 64 (VHOST_NET_BATCH) packets to the under layer
> sockets through msg_control of sendmsg().
> 
> When XDP is enabled on the TUN/TAP, TUN/TAP can process XDP inside a
> loop without caring GUP thus it can do batch map flushing. When XDP is
> not enabled or not supported, the underlayer socket need to build skb
> and pass it to network core. The batched packet submission allows us
> to do batching like netif_receive_skb_list() in the future.
> 
> This saves lots of indirect calls for better cache utilization. For
> the case that we can't so batching e.g when sndbuf is limited or
> packet size is too large, we will go for usual one packet per
> sendmsg() way.
> 
> Doing testpmd on various setups gives us:
> 
> Test                /+pps%
> XDP_DROP on TAP     /+44.8%
> XDP_REDIRECT on TAP /+29%
> macvtap (skb)       /+26%
> 
> Netperf tests shows obvious improvements for small packet transmission:
> 
> size/session/+thu%/+normalize%
>    64/     1/   +2%/    0%
>    64/     2/   +3%/   +1%
>    64/     4/   +7%/   +5%
>    64/     8/   +8%/   +6%
>   256/     1/   +3%/    0%
>   256/     2/  +10%/   +7%
>   256/     4/  +26%/  +22%
>   256/     8/  +27%/  +23%
>   512/     1/   +3%/   +2%
>   512/     2/  +19%/  +14%
>   512/     4/  +43%/  +40%
>   512/     8/  +45%/  +41%
>  1024/     1/   +4%/    0%
>  1024/     2/  +27%/  +21%
>  1024/     4/  +38%/  +73%
>  1024/     8/  +15%/  +24%
>  2048/     1/  +10%/   +7%
>  2048/     2/  +16%/  +12%
>  2048/     4/    0%/   +2%
>  2048/     8/    0%/   +2%
>  4096/     1/  +36%/  +60%
>  4096/     2/  -11%/  -26%
>  4096/     4/    0%/  +14%
>  4096/     8/    0%/   +4%
> 16384/     1/   -1%/   +5%
> 16384/     2/    0%/   +2%
> 16384/     4/    0%/   -3%
> 16384/     8/    0%/   +4%
> 65535/     1/    0%/  +10%
> 65535/     2/    0%/   +8%
> 65535/     4/    0%/   +1%
> 65535/     8/    0%/   +3%
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/net.c | 164 ++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 151 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index fb01ce6d981c..1dd4239cbff8 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -116,6 +116,7 @@ struct vhost_net_virtqueue {
>  	 * For RX, number of batched heads
>  	 */
>  	int done_idx;
> +	int batched_xdp;

Pls add a comment documenting what does this new field do.

>  	/* an array of userspace buffers info */
>  	struct ubuf_info *ubuf_info;
>  	/* Reference counting for outstanding ubufs.
> @@ -123,6 +124,7 @@ struct vhost_net_virtqueue {
>  	struct vhost_net_ubuf_ref *ubufs;
>  	struct ptr_ring *rx_ring;
>  	struct vhost_net_buf rxq;
> +	struct xdp_buff xdp[VHOST_NET_BATCH];

This is a bit too much to have inline. 64 entries 48 bytes each, and we
have 2 of these structures, that's already >4K.

>  };
>  
>  struct vhost_net {
> @@ -338,6 +340,11 @@ static bool vhost_sock_zcopy(struct socket *sock)
>  		sock_flag(sock->sk, SOCK_ZEROCOPY);
>  }
>  
> +static bool vhost_sock_xdp(struct socket *sock)
> +{
> +	return sock_flag(sock->sk, SOCK_XDP);

what if an xdp program is attached while this processing
is going on? Flag value will be wrong - is this safe
and why? Pls add a comment.

> +}
> +
>  /* In case of DMA done not in order in lower device driver for some reason.
>   * upend_idx is used to track end of used idx, done_idx is used to track head
>   * of used idx. Once lower device DMA done contiguously, we will signal KVM
> @@ -444,10 +451,36 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
>  	nvq->done_idx = 0;
>  }
>  
> +static void vhost_tx_batch(struct vhost_net *net,
> +			   struct vhost_net_virtqueue *nvq,
> +			   struct socket *sock,
> +			   struct msghdr *msghdr)
> +{
> +	struct tun_msg_ctl ctl = {
> +		.type = nvq->batched_xdp << 16 | TUN_MSG_PTR,
> +		.ptr = nvq->xdp,
> +	};
> +	int err;
> +
> +	if (nvq->batched_xdp == 0)
> +		goto signal_used;
> +
> +	msghdr->msg_control = &ctl;
> +	err = sock->ops->sendmsg(sock, msghdr, 0);
> +	if (unlikely(err < 0)) {
> +		vq_err(&nvq->vq, "Fail to batch sending packets\n");
> +		return;
> +	}
> +
> +signal_used:
> +	vhost_net_signal_used(nvq);
> +	nvq->batched_xdp = 0;
> +}
> +
>  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>  				    struct vhost_net_virtqueue *nvq,
>  				    unsigned int *out_num, unsigned int *in_num,
> -				    bool *busyloop_intr)
> +				    struct msghdr *msghdr, bool *busyloop_intr)
>  {
>  	struct vhost_virtqueue *vq = &nvq->vq;
>  	unsigned long uninitialized_var(endtime);
> @@ -455,8 +488,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>  				  out_num, in_num, NULL, NULL);
>  
>  	if (r == vq->num && vq->busyloop_timeout) {
> +		/* Flush batched packets first */
>  		if (!vhost_sock_zcopy(vq->private_data))
> -			vhost_net_signal_used(nvq);
> +			vhost_tx_batch(net, nvq, vq->private_data, msghdr);
>  		preempt_disable();
>  		endtime = busy_clock() + vq->busyloop_timeout;
>  		while (vhost_can_busy_poll(endtime)) {
> @@ -512,7 +546,7 @@ static int get_tx_bufs(struct vhost_net *net,
>  	struct vhost_virtqueue *vq = &nvq->vq;
>  	int ret;
>  
> -	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
> +	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
>  
>  	if (ret < 0 || ret == vq->num)
>  		return ret;
> @@ -540,6 +574,83 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
>  	       !vhost_vq_avail_empty(vq->dev, vq);
>  }
>  
> +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

I wonder whether NET_IP_ALIGN make sense for XDP.

> +
> +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> +			       struct iov_iter *from)
> +{
> +	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct socket *sock = vq->private_data;
> +	struct page_frag *alloc_frag = &current->task_frag;
> +	struct virtio_net_hdr *gso;
> +	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
> +	size_t len = iov_iter_count(from);
> +	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
> +	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
> +	int sock_hlen = nvq->sock_hlen;
> +	void *buf;
> +	int copied;
> +
> +	if (unlikely(len < nvq->sock_hlen))
> +		return -EFAULT;
> +
> +	if (SKB_DATA_ALIGN(len + pad) +
> +	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
> +		return -ENOSPC;
> +
> +	buflen += SKB_DATA_ALIGN(len + pad);
> +	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
> +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
> +		return -ENOMEM;
> +
> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +
> +	/* We store two kinds of metadata in the header which will be
> +	 * used for XDP_PASS to do build_skb():
> +	 * offset 0: buflen
> +	 * offset sizeof(int): vnet header

Define a struct for the metadata then?


> +	 */
> +	copied = copy_page_from_iter(alloc_frag->page,
> +				     alloc_frag->offset + sizeof(int),
> +				     sock_hlen, from);
> +	if (copied != sock_hlen)
> +		return -EFAULT;
> +
> +	gso = (struct virtio_net_hdr *)(buf + sizeof(int));
> +
> +	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> +	    vhost16_to_cpu(vq, gso->csum_start) +
> +	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
> +	    vhost16_to_cpu(vq, gso->hdr_len)) {
> +		gso->hdr_len = cpu_to_vhost16(vq,
> +			       vhost16_to_cpu(vq, gso->csum_start) +
> +			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
> +
> +		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
> +			return -EINVAL;
> +	}
> +
> +	len -= sock_hlen;
> +	copied = copy_page_from_iter(alloc_frag->page,
> +				     alloc_frag->offset + pad,
> +				     len, from);
> +	if (copied != len)
> +		return -EFAULT;
> +
> +	xdp->data_hard_start = buf;
> +	xdp->data = buf + pad;
> +	xdp->data_end = xdp->data + len;
> +	*(int *)(xdp->data_hard_start) = buflen;
> +
> +	get_page(alloc_frag->page);
> +	alloc_frag->offset += buflen;
> +
> +	++nvq->batched_xdp;
> +
> +	return 0;
> +}
> +
>  static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  {
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> @@ -556,10 +667,14 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  	size_t len, total_len = 0;
>  	int err;
>  	int sent_pkts = 0;
> +	bool bulking = (sock->sk->sk_sndbuf == INT_MAX);

What does bulking mean?

>  
>  	for (;;) {
>  		bool busyloop_intr = false;
>  
> +		if (nvq->done_idx == VHOST_NET_BATCH)
> +			vhost_tx_batch(net, nvq, sock, &msg);
> +
>  		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
>  				   &busyloop_intr);
>  		/* On error, stop handling until the next kick. */
> @@ -577,14 +692,34 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  			break;
>  		}
>  
> -		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
> -		vq->heads[nvq->done_idx].len = 0;
> -
>  		total_len += len;
> -		if (tx_can_batch(vq, total_len))
> -			msg.msg_flags |= MSG_MORE;
> -		else
> -			msg.msg_flags &= ~MSG_MORE;
> +
> +		/* For simplicity, TX batching is only enabled if
> +		 * sndbuf is unlimited.

What if sndbuf changes while this processing is going on?

> +		 */
> +		if (bulking) {
> +			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
> +			if (!err) {
> +				goto done;
> +			} else if (unlikely(err != -ENOSPC)) {
> +				vhost_tx_batch(net, nvq, sock, &msg);
> +				vhost_discard_vq_desc(vq, 1);
> +				vhost_net_enable_vq(net, vq);
> +				break;
> +			}
> +
> +			/* We can't build XDP buff, go for single
> +			 * packet path but let's flush batched
> +			 * packets.
> +			 */
> +			vhost_tx_batch(net, nvq, sock, &msg);
> +			msg.msg_control = NULL;
> +		} else {
> +			if (tx_can_batch(vq, total_len))
> +				msg.msg_flags |= MSG_MORE;
> +			else
> +				msg.msg_flags &= ~MSG_MORE;
> +		}
>  
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(sock, &msg, len);
> @@ -596,15 +731,17 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: len %d != %zd\n",
>  				 err, len);
> -		if (++nvq->done_idx >= VHOST_NET_BATCH)
> -			vhost_net_signal_used(nvq);
> +done:
> +		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
> +		vq->heads[nvq->done_idx].len = 0;
> +		++nvq->done_idx;
>  		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
>  			vhost_poll_queue(&vq->poll);
>  			break;
>  		}
>  	}
>  
> -	vhost_net_signal_used(nvq);
> +	vhost_tx_batch(net, nvq, sock, &msg);
>  }
>  
>  static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> @@ -1111,6 +1248,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  		n->vqs[i].ubuf_info = NULL;
>  		n->vqs[i].upend_idx = 0;
>  		n->vqs[i].done_idx = 0;
> +		n->vqs[i].batched_xdp = 0;
>  		n->vqs[i].vhost_hlen = 0;
>  		n->vqs[i].sock_hlen = 0;
>  		n->vqs[i].rx_ring = NULL;
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 08/11] tun: switch to new type of msg_control
From: Michael S. Tsirkin @ 2018-09-06 16:54 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-9-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:23PM +0800, Jason Wang wrote:
> This patch introduces to a new tun/tap specific msg_control:
> 
> #define TUN_MSG_UBUF 1
> #define TUN_MSG_PTR  2
> struct tun_msg_ctl {
>        int type;
>        void *ptr;
> };
> 
> This allows us to pass different kinds of msg_control through
> sendmsg(). The first supported type is ubuf (TUN_MSG_UBUF) which will
> be used by the existed vhost_net zerocopy code. The second is XDP
> buff, which allows vhost_net to pass XDP buff to TUN. This could be
> used to implement accepting an array of XDP buffs from vhost_net in
> the following patches.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

At this point, do we want to just add a new sock opt for tap's
benefit? Seems cleaner than (ab)using sendmsg.

> ---
>  drivers/net/tap.c      | 18 ++++++++++++------
>  drivers/net/tun.c      |  6 +++++-
>  drivers/vhost/net.c    |  7 +++++--
>  include/linux/if_tun.h |  7 +++++++
>  4 files changed, 29 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index f0f7cd977667..7996ed7cbf18 100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
>  #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
>  
>  /* Get packet from user space buffer */
> -static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
> +static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
>  			    struct iov_iter *from, int noblock)
>  {
>  	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
> @@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
>  	if (unlikely(len < ETH_HLEN))
>  		goto err;
>  
> -	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
> +	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
>  		struct iov_iter i;
>  
>  		copylen = vnet_hdr.hdr_len ?
> @@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
>  	tap = rcu_dereference(q->tap);
>  	/* copy skb_ubuf_info for callback when skb has no error */
>  	if (zerocopy) {
> -		skb_shinfo(skb)->destructor_arg = m->msg_control;
> +		skb_shinfo(skb)->destructor_arg = msg_control;
>  		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>  		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
> -	} else if (m && m->msg_control) {
> -		struct ubuf_info *uarg = m->msg_control;
> +	} else if (msg_control) {
> +		struct ubuf_info *uarg = msg_control;
>  		uarg->callback(uarg, false);
>  	}
>  
> @@ -1150,7 +1150,13 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m,
>  		       size_t total_len)
>  {
>  	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
> -	return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
> +	struct tun_msg_ctl *ctl = m->msg_control;
> +
> +	if (ctl && ctl->type != TUN_MSG_UBUF)
> +		return -EINVAL;
> +
> +	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
> +			    m->msg_flags & MSG_DONTWAIT);
>  }
>  
>  static int tap_recvmsg(struct socket *sock, struct msghdr *m,
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ff1cbf3ebd50..c839a4bdcbd9 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -2429,11 +2429,15 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>  	int ret;
>  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>  	struct tun_struct *tun = tun_get(tfile);
> +	struct tun_msg_ctl *ctl = m->msg_control;
>  
>  	if (!tun)
>  		return -EBADFD;
>  
> -	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
> +	if (ctl && ctl->type != TUN_MSG_UBUF)
> +		return -EINVAL;
> +
> +	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
>  			   m->msg_flags & MSG_DONTWAIT,
>  			   m->msg_flags & MSG_MORE);
>  	tun_put(tun);
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 4e656f89cb22..fb01ce6d981c 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -620,6 +620,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>  		.msg_controllen = 0,
>  		.msg_flags = MSG_DONTWAIT,
>  	};
> +	struct tun_msg_ctl ctl;
>  	size_t len, total_len = 0;
>  	int err;
>  	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> @@ -664,8 +665,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>  			ubuf->ctx = nvq->ubufs;
>  			ubuf->desc = nvq->upend_idx;
>  			refcount_set(&ubuf->refcnt, 1);
> -			msg.msg_control = ubuf;
> -			msg.msg_controllen = sizeof(ubuf);
> +			msg.msg_control = &ctl;
> +			ctl.type = TUN_MSG_UBUF;
> +			ctl.ptr = ubuf;
> +			msg.msg_controllen = sizeof(ctl);
>  			ubufs = nvq->ubufs;
>  			atomic_inc(&ubufs->refcount);
>  			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
> diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
> index 3d2996dc7d85..ba46dced1f38 100644
> --- a/include/linux/if_tun.h
> +++ b/include/linux/if_tun.h
> @@ -19,6 +19,13 @@
>  
>  #define TUN_XDP_FLAG 0x1UL
>  
> +#define TUN_MSG_UBUF 1
> +#define TUN_MSG_PTR  2

Looks like TUN_MSG_PTR should be pushed out to a follow-up patch?

> +struct tun_msg_ctl {
> +	int type;
> +	void *ptr;
> +};
> +

type actually includes a size. Why not two short fields then?


>  #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
>  struct socket *tun_get_socket(struct file *);
>  struct ptr_ring *tun_get_tx_ring(struct file *file);
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 01/11] net: sock: introduce SOCK_XDP
From: Michael S. Tsirkin @ 2018-09-06 16:56 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-2-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:16PM +0800, Jason Wang wrote:
> This patch introduces a new sock flag - SOCK_XDP. This will be used
> for notifying the upper layer that XDP program is attached on the
> lower socket, and requires for extra headroom.
> 
> TUN will be the first user.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

In fact vhost is the 1st user, right? So this can be
pushed out to become patch 10/11.

> ---
>  drivers/net/tun.c  | 19 +++++++++++++++++++
>  include/net/sock.h |  1 +
>  2 files changed, 20 insertions(+)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ebd07ad82431..2c548bd20393 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -869,6 +869,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>  		tun_napi_init(tun, tfile, napi);
>  	}
>  
> +	if (rtnl_dereference(tun->xdp_prog))
> +		sock_set_flag(&tfile->sk, SOCK_XDP);
> +
>  	tun_set_real_num_queues(tun);
>  
>  	/* device is allowed to go away first, so no need to hold extra
> @@ -1241,13 +1244,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>  		       struct netlink_ext_ack *extack)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
> +	struct tun_file *tfile;
>  	struct bpf_prog *old_prog;
> +	int i;
>  
>  	old_prog = rtnl_dereference(tun->xdp_prog);
>  	rcu_assign_pointer(tun->xdp_prog, prog);
>  	if (old_prog)
>  		bpf_prog_put(old_prog);
>  
> +	for (i = 0; i < tun->numqueues; i++) {
> +		tfile = rtnl_dereference(tun->tfiles[i]);
> +		if (prog)
> +			sock_set_flag(&tfile->sk, SOCK_XDP);
> +		else
> +			sock_reset_flag(&tfile->sk, SOCK_XDP);
> +	}
> +	list_for_each_entry(tfile, &tun->disabled, next) {
> +		if (prog)
> +			sock_set_flag(&tfile->sk, SOCK_XDP);
> +		else
> +			sock_reset_flag(&tfile->sk, SOCK_XDP);
> +	}
> +
>  	return 0;
>  }
>  
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 433f45fc2d68..38cae35f6e16 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -800,6 +800,7 @@ enum sock_flags {
>  	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
>  	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
>  	SOCK_TXTIME,
> +	SOCK_XDP, /* XDP is attached */
>  };
>  
>  #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 02/11] tuntap: switch to use XDP_PACKET_HEADROOM
From: Michael S. Tsirkin @ 2018-09-06 16:57 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-3-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:17PM +0800, Jason Wang wrote:
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

any idea why didn't we do this straight away?

> ---
>  drivers/net/tun.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 2c548bd20393..d3677a544b56 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -113,7 +113,6 @@ do {								\
>  } while (0)
>  #endif
>  
> -#define TUN_HEADROOM 256
>  #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
>  
>  /* TUN device flags */
> @@ -1654,7 +1653,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	rcu_read_lock();
>  	xdp_prog = rcu_dereference(tun->xdp_prog);
>  	if (xdp_prog)
> -		pad += TUN_HEADROOM;
> +		pad += XDP_PACKET_HEADROOM;
>  	buflen += SKB_DATA_ALIGN(len + pad);
>  	rcu_read_unlock();
>  
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 03/11] tuntap: enable bh early during processing XDP
From: Michael S. Tsirkin @ 2018-09-06 17:02 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-4-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:18PM +0800, Jason Wang wrote:
> This patch move the bh enabling a little bit earlier, this will be
> used for factoring out the core XDP logic of tuntap.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

no reason to disable bh for non-xdp things.

> ---
>  drivers/net/tun.c | 10 +++-------
>  1 file changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index d3677a544b56..372caf7d67d9 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1726,22 +1726,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  			goto err_xdp;
>  		}
>  	}
> +	rcu_read_unlock();
> +	local_bh_enable();
>  
>  	skb = build_skb(buf, buflen);
> -	if (!skb) {
> -		rcu_read_unlock();
> -		local_bh_enable();
> +	if (!skb)
>  		return ERR_PTR(-ENOMEM);
> -	}
>  
>  	skb_reserve(skb, pad - delta);
>  	skb_put(skb, len);
>  	get_page(alloc_frag->page);
>  	alloc_frag->offset += buflen;
>  
> -	rcu_read_unlock();
> -	local_bh_enable();
> -
>  	return skb;
>  
>  err_redirect:
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 04/11] tuntap: simplify error handling in tun_build_skb()
From: Michael S. Tsirkin @ 2018-09-06 17:14 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-5-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:19PM +0800, Jason Wang wrote:
> There's no need to duplicate page get logic in each action. So this
> patch tries to get page and calculate the offset before processing XDP
> actions, and undo them when meet errors (we don't care the performance
> on errors). This will be used for factoring out XDP logic.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

I see some issues with this one.

> ---
>  drivers/net/tun.c | 37 ++++++++++++++++---------------------
>  1 file changed, 16 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 372caf7d67d9..f8cdcfa392c3 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1642,7 +1642,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  				     int len, int *skb_xdp)
>  {
>  	struct page_frag *alloc_frag = &current->task_frag;
> -	struct sk_buff *skb;
> +	struct sk_buff *skb = NULL;
>  	struct bpf_prog *xdp_prog;
>  	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>  	unsigned int delta = 0;
> @@ -1668,6 +1668,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	if (copied != len)
>  		return ERR_PTR(-EFAULT);
>  
> +	get_page(alloc_frag->page);
> +	alloc_frag->offset += buflen;
> +

This adds an atomic op on XDP_DROP which is a data path
operation for some workloads.

>  	/* There's a small window that XDP may be set after the check
>  	 * of xdp_prog above, this should be rare and for simplicity
>  	 * we do XDP on skb in case the headroom is not enough.
> @@ -1695,23 +1698,15 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  
>  		switch (act) {
>  		case XDP_REDIRECT:
> -			get_page(alloc_frag->page);
> -			alloc_frag->offset += buflen;
>  			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
>  			xdp_do_flush_map();
>  			if (err)
> -				goto err_redirect;
> -			rcu_read_unlock();
> -			local_bh_enable();
> -			return NULL;
> +				goto err_xdp;
> +			goto out;
>  		case XDP_TX:
> -			get_page(alloc_frag->page);
> -			alloc_frag->offset += buflen;
>  			if (tun_xdp_tx(tun->dev, &xdp) < 0)
> -				goto err_redirect;
> -			rcu_read_unlock();
> -			local_bh_enable();
> -			return NULL;
> +				goto err_xdp;
> +			goto out;
>  		case XDP_PASS:
>  			delta = orig_data - xdp.data;
>  			len = xdp.data_end - xdp.data;
> @@ -1730,23 +1725,23 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	local_bh_enable();
>  
>  	skb = build_skb(buf, buflen);
> -	if (!skb)
> -		return ERR_PTR(-ENOMEM);
> +	if (!skb) {
> +		skb = ERR_PTR(-ENOMEM);
> +		goto out;

So goto out will skip put_page, and we did
do get_page above. Seems wrong. You should
goto err_skb or something like this.


> +	}
>  
>  	skb_reserve(skb, pad - delta);
>  	skb_put(skb, len);
> -	get_page(alloc_frag->page);
> -	alloc_frag->offset += buflen;
>  
>  	return skb;
>  
> -err_redirect:
> -	put_page(alloc_frag->page);
>  err_xdp:
> +	alloc_frag->offset -= buflen;
> +	put_page(alloc_frag->page);
> +out:

Out here isn't an error at all, is it?  You should not mix return and
error handling IMHO.



>  	rcu_read_unlock();
>  	local_bh_enable();
> -	this_cpu_inc(tun->pcpu_stats->rx_dropped);

Doesn't this break rx_dropped accounting?

> -	return NULL;
> +	return skb;
>  }
>  
>  /* Get packet from user space buffer */
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 05/11] tuntap: tweak on the path of non-xdp case in tun_build_skb()
From: Michael S. Tsirkin @ 2018-09-06 17:16 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-6-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:20PM +0800, Jason Wang wrote:
> If we're sure not to go native XDP, there's no need for several things
> like bh and rcu stuffs.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

True...

> ---
>  drivers/net/tun.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index f8cdcfa392c3..389aa0727cc6 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1675,10 +1675,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	 * of xdp_prog above, this should be rare and for simplicity
>  	 * we do XDP on skb in case the headroom is not enough.
>  	 */
> -	if (hdr->gso_type || !xdp_prog)
> +	if (hdr->gso_type || !xdp_prog) {
>  		*skb_xdp = 1;
> -	else
> -		*skb_xdp = 0;
> +		goto build;
> +	}
> +
> +	*skb_xdp = 0;
>  
>  	local_bh_disable();
>  	rcu_read_lock();
> @@ -1724,6 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	rcu_read_unlock();
>  	local_bh_enable();
>  
> +build:

But this is spaghetti code. Please just put common
code into functions and call them, don't goto.

>  	skb = build_skb(buf, buflen);
>  	if (!skb) {
>  		skb = ERR_PTR(-ENOMEM);
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 06/11] tuntap: split out XDP logic
From: Michael S. Tsirkin @ 2018-09-06 17:21 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-7-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:21PM +0800, Jason Wang wrote:
> This patch split out XDP logic into a single function. This make it to
> be reused by XDP batching path in the following patch.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tun.c | 84 ++++++++++++++++++++++++++++-------------------
>  1 file changed, 51 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 389aa0727cc6..21b125020b3b 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1635,6 +1635,44 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
>  	return true;
>  }
>  
> +static u32 tun_do_xdp(struct tun_struct *tun,
> +		      struct tun_file *tfile,
> +		      struct bpf_prog *xdp_prog,
> +		      struct xdp_buff *xdp,
> +		      int *err)
> +{
> +	u32 act = bpf_prog_run_xdp(xdp_prog, xdp);
> +
> +	switch (act) {
> +	case XDP_REDIRECT:
> +		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
> +		xdp_do_flush_map();
> +		if (*err)
> +			break;
> +		goto out;
> +	case XDP_TX:
> +		*err = tun_xdp_tx(tun->dev, xdp);
> +		if (*err < 0)
> +			break;
> +		*err = 0;
> +		goto out;
> +	case XDP_PASS:
> +		goto out;

Do we need goto? why not just return?

> +	default:
> +		bpf_warn_invalid_xdp_action(act);
> +		/* fall through */
> +	case XDP_ABORTED:
> +		trace_xdp_exception(tun->dev, xdp_prog, act);
> +		/* fall through */
> +	case XDP_DROP:
> +		break;
> +	}
> +
> +	put_page(virt_to_head_page(xdp->data_hard_start));

put here because caller does get_page :( Not pretty.
I'd move this out to the caller.

> +out:
> +	return act;

How about combining err and act? err is < 0 XDP_PASS is > 0.
No need for pointers then.

> +}
> +
>  static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  				     struct tun_file *tfile,
>  				     struct iov_iter *from,
> @@ -1645,10 +1683,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	struct sk_buff *skb = NULL;
>  	struct bpf_prog *xdp_prog;
>  	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -	unsigned int delta = 0;
>  	char *buf;
>  	size_t copied;
> -	int err, pad = TUN_RX_PAD;
> +	int pad = TUN_RX_PAD;
> +	int err = 0;
>  
>  	rcu_read_lock();
>  	xdp_prog = rcu_dereference(tun->xdp_prog);
> @@ -1685,9 +1723,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	local_bh_disable();
>  	rcu_read_lock();
>  	xdp_prog = rcu_dereference(tun->xdp_prog);
> -	if (xdp_prog && !*skb_xdp) {
> +	if (xdp_prog) {
>  		struct xdp_buff xdp;
> -		void *orig_data;
>  		u32 act;
>  
>  		xdp.data_hard_start = buf;
> @@ -1695,33 +1732,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  		xdp_set_data_meta_invalid(&xdp);
>  		xdp.data_end = xdp.data + len;
>  		xdp.rxq = &tfile->xdp_rxq;
> -		orig_data = xdp.data;
> -		act = bpf_prog_run_xdp(xdp_prog, &xdp);
> -
> -		switch (act) {
> -		case XDP_REDIRECT:
> -			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
> -			xdp_do_flush_map();
> -			if (err)
> -				goto err_xdp;
> -			goto out;
> -		case XDP_TX:
> -			if (tun_xdp_tx(tun->dev, &xdp) < 0)
> -				goto err_xdp;
> -			goto out;
> -		case XDP_PASS:
> -			delta = orig_data - xdp.data;
> -			len = xdp.data_end - xdp.data;
> -			break;
> -		default:
> -			bpf_warn_invalid_xdp_action(act);
> -			/* fall through */
> -		case XDP_ABORTED:
> -			trace_xdp_exception(tun->dev, xdp_prog, act);
> -			/* fall through */
> -		case XDP_DROP:
> +		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
> +		if (err)
>  			goto err_xdp;
> -		}
> +		if (act != XDP_PASS)
> +			goto out;

likely?

> +
> +		pad = xdp.data - xdp.data_hard_start;
> +		len = xdp.data_end - xdp.data;
>  	}
>  	rcu_read_unlock();
>  	local_bh_enable();
> @@ -1729,18 +1747,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  build:
>  	skb = build_skb(buf, buflen);
>  	if (!skb) {
> +		put_page(alloc_frag->page);
>  		skb = ERR_PTR(-ENOMEM);
>  		goto out;
>  	}
>  
> -	skb_reserve(skb, pad - delta);
> +	skb_reserve(skb, pad);
>  	skb_put(skb, len);
>  
>  	return skb;
>  
>  err_xdp:
> -	alloc_frag->offset -= buflen;
> -	put_page(alloc_frag->page);
> +	this_cpu_inc(tun->pcpu_stats->rx_dropped);


This fixes bug in previous patch which dropped it. OK :)

>  out:
>  	rcu_read_unlock();
>  	local_bh_enable();
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 07/11] tuntap: move XDP flushing out of tun_do_xdp()
From: Michael S. Tsirkin @ 2018-09-06 17:48 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-8-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:22PM +0800, Jason Wang wrote:
> This will allow adding batch flushing on top.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tun.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 21b125020b3b..ff1cbf3ebd50 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1646,7 +1646,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
>  	switch (act) {
>  	case XDP_REDIRECT:
>  		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
> -		xdp_do_flush_map();
>  		if (*err)
>  			break;
>  		goto out;
> @@ -1735,6 +1734,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
>  		if (err)
>  			goto err_xdp;
> +
> +		if (act == XDP_REDIRECT)
> +			xdp_do_flush_map();
>  		if (act != XDP_PASS)
>  			goto out;

At this point the switch statement which used to contain all XDP things
seems to be gone completely. Just rewrite with a bunch of if statements
and all xdp handling spread out to where it makes sense?

> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 09/11] tuntap: accept an array of XDP buffs through sendmsg()
From: Michael S. Tsirkin @ 2018-09-06 17:51 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-10-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:24PM +0800, Jason Wang wrote:
> This patch implement TUN_MSG_PTR msg_control type. This type allows
> the caller to pass an array of XDP buffs to tuntap through ptr field
> of the tun_msg_control. If an XDP program is attached, tuntap can run
> XDP program directly. If not, tuntap will build skb and do a fast
> receiving since part of the work has been done by vhost_net.
> 
> This will avoid lots of indirect calls thus improves the icache
> utilization and allows to do XDP batched flushing when doing XDP
> redirection.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Is most of the benefit in batched flushing or skipping
indirect calls? Because if it's flushing we can gain
most of it easily by adding an analog of xmit_more.

> ---
>  drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 100 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index c839a4bdcbd9..069db2e5dd08 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -2424,22 +2424,119 @@ static void tun_sock_write_space(struct sock *sk)
>  	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
>  }
>  
> +static int tun_xdp_one(struct tun_struct *tun,
> +		       struct tun_file *tfile,
> +		       struct xdp_buff *xdp, int *flush)
> +{
> +	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
> +	struct tun_pcpu_stats *stats;
> +	struct bpf_prog *xdp_prog;
> +	struct sk_buff *skb = NULL;
> +	u32 rxhash = 0, act;
> +	int buflen = *(int *)xdp->data_hard_start;
> +	int err = 0;
> +	bool skb_xdp = false;
> +
> +	xdp_prog = rcu_dereference(tun->xdp_prog);
> +	if (xdp_prog) {
> +		if (gso->gso_type) {
> +			skb_xdp = true;
> +			goto build;
> +		}
> +		xdp_set_data_meta_invalid(xdp);
> +		xdp->rxq = &tfile->xdp_rxq;
> +		act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
> +		if (err)
> +			goto out;
> +		if (act == XDP_REDIRECT)
> +			*flush = true;
> +		if (act != XDP_PASS)
> +			goto out;
> +	}
> +
> +build:
> +	skb = build_skb(xdp->data_hard_start, buflen);
> +	if (!skb) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> +	skb_put(skb, xdp->data_end - xdp->data);
> +
> +	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
> +		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
> +		kfree_skb(skb);
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	skb->protocol = eth_type_trans(skb, tun->dev);
> +	skb_reset_network_header(skb);
> +	skb_probe_transport_header(skb, 0);
> +
> +	if (skb_xdp) {
> +		err = do_xdp_generic(xdp_prog, skb);
> +		if (err != XDP_PASS)
> +			goto out;
> +	}
> +
> +	if (!rcu_dereference(tun->steering_prog))
> +		rxhash = __skb_get_hash_symmetric(skb);
> +
> +	netif_receive_skb(skb);
> +
> +	stats = get_cpu_ptr(tun->pcpu_stats);
> +	u64_stats_update_begin(&stats->syncp);
> +	stats->rx_packets++;
> +	stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&stats->syncp);
> +	put_cpu_ptr(stats);
> +
> +	if (rxhash)
> +		tun_flow_update(tun, rxhash, tfile);
> +
> +out:
> +	return err;
> +}
> +
>  static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>  {
> -	int ret;
> +	int ret, i;
>  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>  	struct tun_struct *tun = tun_get(tfile);
>  	struct tun_msg_ctl *ctl = m->msg_control;
> +	struct xdp_buff *xdp;
>  
>  	if (!tun)
>  		return -EBADFD;
>  
> -	if (ctl && ctl->type != TUN_MSG_UBUF)
> -		return -EINVAL;
> +	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> +		int n = ctl->type >> 16;
> +		int flush = 0;
> +
> +		local_bh_disable();
> +		rcu_read_lock();
> +
> +		for (i = 0; i < n; i++) {
> +			xdp = &((struct xdp_buff *)ctl->ptr)[i];
> +			tun_xdp_one(tun, tfile, xdp, &flush);
> +		}
> +
> +		if (flush)
> +			xdp_do_flush_map();
> +
> +		rcu_read_unlock();
> +		local_bh_enable();
> +
> +		ret = total_len;
> +		goto out;
> +	}
>  
>  	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
>  			   m->msg_flags & MSG_DONTWAIT,
>  			   m->msg_flags & MSG_MORE);
> +out:
>  	tun_put(tun);
>  	return ret;
>  }
> -- 
> 2.17.1

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox