* [RFC PATCH] virtio_net: XDP support for adjust_head
@ 2017-01-02 19:44 John Fastabend
2017-01-02 19:47 ` John Fastabend
2017-01-03 6:01 ` Jason Wang
0 siblings, 2 replies; 15+ messages in thread
From: John Fastabend @ 2017-01-02 19:44 UTC (permalink / raw)
To: jasowang, mst
Cc: john.r.fastabend, netdev, john.fastabend, alexei.starovoitov,
daniel
Add support for XDP adjust head by allocating a 256B header region
that XDP programs can grow into. This is only enabled when a XDP
program is loaded.
In order to ensure that we do not have to unwind queue headroom push
queue setup below bpf_prog_add. It reads better to do a prog ref
unwind vs another queue setup call.
: There is a problem with this patch as is. When xdp prog is loaded
the old buffers without the 256B headers need to be flushed so that
the bpf prog has the necessary headroom. This patch does this by
calling the virtqueue_detach_unused_buf() and followed by the
virtnet_set_queues() call to reinitialize the buffers. However I
don't believe this is safe: per the comment in virtio_ring, this API
is not valid on an active queue, and the only thing we have done
here is the napi_disable/napi_enable wrappers, which don't do anything
to the emulation layer.
So the RFC is really to find the best solution to this problem.
A couple things come to mind, (a) always allocate the necessary
headroom but this is a bit of a waste (b) add some bit somewhere
to check if the buffer has headroom but this would mean XDP programs
would be broken for a cycle through the ring, (c) figure out how
to deactivate a queue, free the buffers and finally reallocate.
I think (c) is the best choice for now but I'm not seeing the
API to do this so virtio/qemu experts anyone know off-hand
how to make this work? I started looking into the PCI callbacks
reset() and virtio_device_ready() or possibly hitting the right
set of bits with vp_set_status() but my first attempt just hung
the device.
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
drivers/net/virtio_net.c | 106 +++++++++++++++++++++++++++++++++++-----------
1 file changed, 80 insertions(+), 26 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5deeda6..fcc5bd7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -159,6 +159,9 @@ struct virtnet_info {
/* Ethtool settings */
u8 duplex;
u32 speed;
+
+ /* Headroom allocated in RX Queue */
+ unsigned int headroom;
};
struct padded_vnet_hdr {
@@ -355,6 +358,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
}
if (vi->mergeable_rx_bufs) {
+ xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
/* Zero header and leave csum up to XDP layers */
hdr = xdp->data;
memset(hdr, 0, vi->hdr_len);
@@ -371,7 +375,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
num_sg = 2;
sg_init_table(sq->sg, 2);
sg_set_buf(sq->sg, hdr, vi->hdr_len);
- skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+ skb_to_sgvec(skb, sq->sg + 1, vi->headroom, xdp->data_end - xdp->data);
}
err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
data, GFP_ATOMIC);
@@ -393,34 +397,39 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
struct bpf_prog *xdp_prog,
void *data, int len)
{
- int hdr_padded_len;
struct xdp_buff xdp;
- void *buf;
unsigned int qp;
u32 act;
+
if (vi->mergeable_rx_bufs) {
- hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
- xdp.data = data + hdr_padded_len;
+ int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+ /* Allow consuming headroom but reserve enough space to push
+ * the descriptor on if we get an XDP_TX return code.
+ */
+ xdp.data_hard_start = data - vi->headroom + desc_room;
+ xdp.data = data + desc_room;
xdp.data_end = xdp.data + (len - vi->hdr_len);
- buf = data;
} else { /* small buffers */
struct sk_buff *skb = data;
- xdp.data = skb->data;
+ xdp.data_hard_start = skb->data;
+ xdp.data = skb->data + vi->headroom;
xdp.data_end = xdp.data + len;
- buf = skb->data;
}
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
case XDP_PASS:
+ if (!vi->mergeable_rx_bufs)
+ __skb_pull((struct sk_buff *) data,
+ xdp.data - xdp.data_hard_start);
return XDP_PASS;
case XDP_TX:
qp = vi->curr_queue_pairs -
vi->xdp_queue_pairs +
smp_processor_id();
- xdp.data = buf;
virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data);
return XDP_TX;
default:
@@ -440,7 +449,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
struct bpf_prog *xdp_prog;
len -= vi->hdr_len;
- skb_trim(skb, len);
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -463,7 +471,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
}
}
rcu_read_unlock();
-
+ skb_trim(skb, len);
return skb;
err_xdp:
@@ -772,20 +780,21 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
gfp_t gfp)
{
+ int headroom = GOOD_PACKET_LEN + vi->headroom;
struct sk_buff *skb;
struct virtio_net_hdr_mrg_rxbuf *hdr;
int err;
- skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
+ skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
if (unlikely(!skb))
return -ENOMEM;
- skb_put(skb, GOOD_PACKET_LEN);
+ skb_put(skb, headroom);
hdr = skb_vnet_hdr(skb);
sg_init_table(rq->sg, 2);
sg_set_buf(rq->sg, hdr, vi->hdr_len);
- skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
+ skb_to_sgvec(skb, rq->sg + 1, vi->headroom, skb->len - vi->headroom);
err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
if (err < 0)
@@ -853,24 +862,27 @@ static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}
-static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
+static int add_recvbuf_mergeable(struct virtnet_info *vi,
+ struct receive_queue *rq, gfp_t gfp)
{
struct page_frag *alloc_frag = &rq->alloc_frag;
+ unsigned int headroom = vi->headroom;
char *buf;
unsigned long ctx;
int err;
unsigned int len, hole;
len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
- if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
+ if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
return -ENOMEM;
buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+ buf += headroom; /* advance address leaving hole at front of pkt */
ctx = mergeable_buf_to_ctx(buf, len);
get_page(alloc_frag->page);
- alloc_frag->offset += len;
+ alloc_frag->offset += len + headroom;
hole = alloc_frag->size - alloc_frag->offset;
- if (hole < len) {
+ if (hole < len + headroom) {
/* To avoid internal fragmentation, if there is very likely not
* enough space for another buffer, add the remaining space to
* the current buffer. This extra space is not included in
@@ -904,7 +916,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
gfp |= __GFP_COLD;
do {
if (vi->mergeable_rx_bufs)
- err = add_recvbuf_mergeable(rq, gfp);
+ err = add_recvbuf_mergeable(vi, rq, gfp);
else if (vi->big_packets)
err = add_recvbuf_big(vi, rq, gfp);
else
@@ -1699,12 +1711,15 @@ static void virtnet_init_settings(struct net_device *dev)
.set_settings = virtnet_set_settings,
};
+#define VIRTIO_XDP_HEADROOM 256
+
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
struct virtnet_info *vi = netdev_priv(dev);
struct bpf_prog *old_prog;
u16 xdp_qp = 0, curr_qp;
+ unsigned int old_hr;
int i, err;
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1736,19 +1751,58 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
return -ENOMEM;
}
+ old_hr = vi->headroom;
+ if (prog) {
+ prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ vi->headroom = VIRTIO_XDP_HEADROOM;
+ } else {
+ vi->headroom = 0;
+ }
+
+ /* Changing the headroom in buffers is a disruptive operation because
+ * existing buffers must be flushed and reallocated. This will happen
+ * when a xdp program is initially added or xdp is disabled by removing
+ * the xdp program.
+ */
+ if (old_hr != vi->headroom) {
+ cancel_delayed_work_sync(&vi->refill);
+ if (netif_running(vi->dev)) {
+ for (i = 0; i < vi->max_queue_pairs; i++)
+ napi_disable(&vi->rq[i].napi);
+ }
+ for (i = 0; i < vi->max_queue_pairs; i++) {
+ struct virtqueue *vq = vi->rq[i].vq;
+ void *buf;
+
+ while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
+ if (vi->mergeable_rx_bufs) {
+ unsigned long ctx = (unsigned long)buf;
+ void *base = mergeable_ctx_to_buf_address(ctx);
+ put_page(virt_to_head_page(base));
+ } else if (vi->big_packets) {
+ give_pages(&vi->rq[i], buf);
+ } else {
+ dev_kfree_skb(buf);
+ }
+ }
+ }
+ if (netif_running(vi->dev)) {
+ for (i = 0; i < vi->max_queue_pairs; i++)
+ virtnet_napi_enable(&vi->rq[i]);
+ }
+ }
+
err = virtnet_set_queues(vi, curr_qp + xdp_qp);
if (err) {
dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
+ vi->headroom = old_hr;
+ if (prog)
+ bpf_prog_sub(prog, vi->max_queue_pairs - 1);
return err;
}
- if (prog) {
- prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
- if (IS_ERR(prog)) {
- virtnet_set_queues(vi, curr_qp);
- return PTR_ERR(prog);
- }
- }
vi->xdp_queue_pairs = xdp_qp;
netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-02 19:44 [RFC PATCH] virtio_net: XDP support for adjust_head John Fastabend
@ 2017-01-02 19:47 ` John Fastabend
2017-01-03 6:01 ` Jason Wang
1 sibling, 0 replies; 15+ messages in thread
From: John Fastabend @ 2017-01-02 19:47 UTC (permalink / raw)
To: jasowang, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 17-01-02 11:44 AM, John Fastabend wrote:
> Add support for XDP adjust head by allocating a 256B header region
> that XDP programs can grow into. This is only enabled when a XDP
> program is loaded.
>
> In order to ensure that we do not have to unwind queue headroom push
> queue setup below bpf_prog_add. It reads better to do a prog ref
> unwind vs another queue setup call.
>
> : There is a problem with this patch as is. When xdp prog is loaded
> the old buffers without the 256B headers need to be flushed so that
> the bpf prog has the necessary headroom. This patch does this by
> calling the virtqueue_detach_unused_buf() and followed by the
> virtnet_set_queues() call to reinitialize the buffers. However I
> don't believe this is safe per comment in virtio_ring this API
> is not valid on an active queue and the only thing we have done
> here is napi_disable/napi_enable wrappers which doesn't do anything
> to the emulation layer.
>
> So the RFC is really to find the best solution to this problem.
> A couple things come to mind, (a) always allocate the necessary
> headroom but this is a bit of a waste (b) add some bit somewhere
> to check if the buffer has headroom but this would mean XDP programs
> would be broke for a cycle through the ring, (c) figure out how
> to deactivate a queue, free the buffers and finally reallocate.
> I think (c) is the best choice for now but I'm not seeing the
> API to do this so virtio/qemu experts anyone know off-hand
> how to make this work? I started looking into the PCI callbacks
> reset() and virtio_device_ready() or possibly hitting the right
> set of bits with vp_set_status() but my first attempt just hung
> the device.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
[...]
> +
> + /* Changing the headroom in buffers is a disruptive operation because
> + * existing buffers must be flushed and reallocated. This will happen
> + * when a xdp program is initially added or xdp is disabled by removing
> + * the xdp program.
> + */
> + if (old_hr != vi->headroom) {
> + cancel_delayed_work_sync(&vi->refill);
> + if (netif_running(vi->dev)) {
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + napi_disable(&vi->rq[i].napi);
> + }
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + struct virtqueue *vq = vi->rq[i].vq;
> + void *buf;
> +
> + while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Per API in virtio_ring.c queue must be deactivated to call this.
> + if (vi->mergeable_rx_bufs) {
> + unsigned long ctx = (unsigned long)buf;
> + void *base = mergeable_ctx_to_buf_address(ctx);
> + put_page(virt_to_head_page(base));
> + } else if (vi->big_packets) {
> + give_pages(&vi->rq[i], buf);
> + } else {
> + dev_kfree_skb(buf);
> + }
> + }
> + }
> + if (netif_running(vi->dev)) {
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + virtnet_napi_enable(&vi->rq[i]);
> + }
> + }
> +
Hi Jason, Michael,
Any hints on how to solve the above kludge to flush the existing ring and
reallocate with correct headroom? The above doesn't look safe to me per commit
message.
Thanks!
John
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-02 19:44 [RFC PATCH] virtio_net: XDP support for adjust_head John Fastabend
2017-01-02 19:47 ` John Fastabend
@ 2017-01-03 6:01 ` Jason Wang
2017-01-03 16:54 ` John Fastabend
` (2 more replies)
1 sibling, 3 replies; 15+ messages in thread
From: Jason Wang @ 2017-01-03 6:01 UTC (permalink / raw)
To: John Fastabend, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 2017年01月03日 03:44, John Fastabend wrote:
> Add support for XDP adjust head by allocating a 256B header region
> that XDP programs can grow into. This is only enabled when a XDP
> program is loaded.
>
> In order to ensure that we do not have to unwind queue headroom push
> queue setup below bpf_prog_add. It reads better to do a prog ref
> unwind vs another queue setup call.
>
> : There is a problem with this patch as is. When xdp prog is loaded
> the old buffers without the 256B headers need to be flushed so that
> the bpf prog has the necessary headroom. This patch does this by
> calling the virtqueue_detach_unused_buf() and followed by the
> virtnet_set_queues() call to reinitialize the buffers. However I
> don't believe this is safe per comment in virtio_ring this API
> is not valid on an active queue and the only thing we have done
> here is napi_disable/napi_enable wrappers which doesn't do anything
> to the emulation layer.
>
> So the RFC is really to find the best solution to this problem.
> A couple things come to mind, (a) always allocate the necessary
> headroom but this is a bit of a waste (b) add some bit somewhere
> to check if the buffer has headroom but this would mean XDP programs
> would be broke for a cycle through the ring, (c) figure out how
> to deactivate a queue, free the buffers and finally reallocate.
> I think (c) is the best choice for now but I'm not seeing the
> API to do this so virtio/qemu experts anyone know off-hand
> how to make this work? I started looking into the PCI callbacks
> reset() and virtio_device_ready() or possibly hitting the right
> set of bits with vp_set_status() but my first attempt just hung
> the device.
Hi John:
AFAIK, disabling a specific queue was supported only by virtio 1.0
through queue_enable field in pci common cfg. But unfortunately, qemu
does not emulate this at all and legacy device does not even support
this. So the safe way is probably to reset the device and redo the
initialization here.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
> drivers/net/virtio_net.c | 106 +++++++++++++++++++++++++++++++++++-----------
> 1 file changed, 80 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 5deeda6..fcc5bd7 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -159,6 +159,9 @@ struct virtnet_info {
> /* Ethtool settings */
> u8 duplex;
> u32 speed;
> +
> + /* Headroom allocated in RX Queue */
> + unsigned int headroom;
> };
>
> struct padded_vnet_hdr {
> @@ -355,6 +358,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
> }
>
> if (vi->mergeable_rx_bufs) {
> + xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> /* Zero header and leave csum up to XDP layers */
> hdr = xdp->data;
> memset(hdr, 0, vi->hdr_len);
> @@ -371,7 +375,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
> num_sg = 2;
> sg_init_table(sq->sg, 2);
> sg_set_buf(sq->sg, hdr, vi->hdr_len);
> - skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
> + skb_to_sgvec(skb, sq->sg + 1, vi->headroom, xdp->data_end - xdp->data);
vi->headroom looks suspicious, should it be xdp->data - xdp->data_hard_start?
> }
> err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> data, GFP_ATOMIC);
> @@ -393,34 +397,39 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
> struct bpf_prog *xdp_prog,
> void *data, int len)
> {
> - int hdr_padded_len;
> struct xdp_buff xdp;
> - void *buf;
> unsigned int qp;
> u32 act;
>
> +
> if (vi->mergeable_rx_bufs) {
> - hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> - xdp.data = data + hdr_padded_len;
> + int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +
> + /* Allow consuming headroom but reserve enough space to push
> + * the descriptor on if we get an XDP_TX return code.
> + */
> + xdp.data_hard_start = data - vi->headroom + desc_room;
> + xdp.data = data + desc_room;
> xdp.data_end = xdp.data + (len - vi->hdr_len);
> - buf = data;
> } else { /* small buffers */
> struct sk_buff *skb = data;
>
> - xdp.data = skb->data;
> + xdp.data_hard_start = skb->data;
> + xdp.data = skb->data + vi->headroom;
> xdp.data_end = xdp.data + len;
> - buf = skb->data;
> }
>
> act = bpf_prog_run_xdp(xdp_prog, &xdp);
> switch (act) {
> case XDP_PASS:
> + if (!vi->mergeable_rx_bufs)
> + __skb_pull((struct sk_buff *) data,
> + xdp.data - xdp.data_hard_start);
Instead of doing things here and in virtnet_xdp_xmit(), how about always
making skb->data point to the buffer head, like:
1) reserve headroom in add_recvbuf_small()
2) skb_push(xdp->data - xdp_data_hard_start, skb) if we detect xdp->data
was modified after bpf_prog_run_xdp()
Then there's no special code in either XDP_PASS or XDP_TX?
> return XDP_PASS;
> case XDP_TX:
> qp = vi->curr_queue_pairs -
> vi->xdp_queue_pairs +
> smp_processor_id();
[...]
> +#define VIRTIO_XDP_HEADROOM 256
> +
> static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> {
> unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
> struct virtnet_info *vi = netdev_priv(dev);
> struct bpf_prog *old_prog;
> u16 xdp_qp = 0, curr_qp;
> + unsigned int old_hr;
> int i, err;
>
> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -1736,19 +1751,58 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> return -ENOMEM;
> }
>
> + old_hr = vi->headroom;
> + if (prog) {
> + prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> + if (IS_ERR(prog))
> + return PTR_ERR(prog);
> + vi->headroom = VIRTIO_XDP_HEADROOM;
> + } else {
> + vi->headroom = 0;
> + }
> +
> + /* Changing the headroom in buffers is a disruptive operation because
> + * existing buffers must be flushed and reallocated. This will happen
> + * when a xdp program is initially added or xdp is disabled by removing
> + * the xdp program.
> + */
We probably need to reset the device here, but maybe Michael has more
ideas. And if we do this, another interesting thing to do is to disable
EWMA and always use a single page for each packet, this could almost
eliminate linearizing.
Thanks
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-03 6:01 ` Jason Wang
@ 2017-01-03 16:54 ` John Fastabend
2017-01-03 16:57 ` John Fastabend
2017-01-04 3:21 ` Jason Wang
2017-01-03 22:16 ` Michael S. Tsirkin
2017-01-04 18:58 ` John Fastabend
2 siblings, 2 replies; 15+ messages in thread
From: John Fastabend @ 2017-01-03 16:54 UTC (permalink / raw)
To: Jason Wang, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 17-01-02 10:01 PM, Jason Wang wrote:
>
>
> On 2017年01月03日 03:44, John Fastabend wrote:
>> Add support for XDP adjust head by allocating a 256B header region
>> that XDP programs can grow into. This is only enabled when a XDP
>> program is loaded.
>>
>> In order to ensure that we do not have to unwind queue headroom push
>> queue setup below bpf_prog_add. It reads better to do a prog ref
>> unwind vs another queue setup call.
>>
>> : There is a problem with this patch as is. When xdp prog is loaded
>> the old buffers without the 256B headers need to be flushed so that
>> the bpf prog has the necessary headroom. This patch does this by
>> calling the virtqueue_detach_unused_buf() and followed by the
>> virtnet_set_queues() call to reinitialize the buffers. However I
>> don't believe this is safe per comment in virtio_ring this API
>> is not valid on an active queue and the only thing we have done
>> here is napi_disable/napi_enable wrappers which doesn't do anything
>> to the emulation layer.
>>
>> So the RFC is really to find the best solution to this problem.
>> A couple things come to mind, (a) always allocate the necessary
>> headroom but this is a bit of a waste (b) add some bit somewhere
>> to check if the buffer has headroom but this would mean XDP programs
>> would be broke for a cycle through the ring, (c) figure out how
>> to deactivate a queue, free the buffers and finally reallocate.
>> I think (c) is the best choice for now but I'm not seeing the
>> API to do this so virtio/qemu experts anyone know off-hand
>> how to make this work? I started looking into the PCI callbacks
>> reset() and virtio_device_ready() or possibly hitting the right
>> set of bits with vp_set_status() but my first attempt just hung
>> the device.
>
> Hi John:
>
> AFAIK, disabling a specific queue was supported only by virtio 1.0 through
> queue_enable field in pci common cfg. But unfortunately, qemu does not emulate
> this at all and legacy device does not even support this. So the safe way is
> probably reset the device and redo the initialization here.
>
OK, I'll draft up a fix with a full reset unless Michael has some idea in
the meantime.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>> drivers/net/virtio_net.c | 106 +++++++++++++++++++++++++++++++++++-----------
>> 1 file changed, 80 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 5deeda6..fcc5bd7 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -159,6 +159,9 @@ struct virtnet_info {
>> /* Ethtool settings */
>> u8 duplex;
>> u32 speed;
>> +
>> + /* Headroom allocated in RX Queue */
>> + unsigned int headroom;
>> };
>> struct padded_vnet_hdr {
>> @@ -355,6 +358,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>> }
>> if (vi->mergeable_rx_bufs) {
>> + xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>> /* Zero header and leave csum up to XDP layers */
>> hdr = xdp->data;
>> memset(hdr, 0, vi->hdr_len);
>> @@ -371,7 +375,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>> num_sg = 2;
>> sg_init_table(sq->sg, 2);
>> sg_set_buf(sq->sg, hdr, vi->hdr_len);
>> - skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
>> + skb_to_sgvec(skb, sq->sg + 1, vi->headroom, xdp->data_end - xdp->data);
>
> vi->headroom look suspicious, should it be xdp->data - xdp->data_hard_start?
>
Yep found this as well while testing small packet receive.
>> }
>> err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
>> data, GFP_ATOMIC);
>> @@ -393,34 +397,39 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
>> struct bpf_prog *xdp_prog,
>> void *data, int len)
>> {
>> - int hdr_padded_len;
>> struct xdp_buff xdp;
>> - void *buf;
>> unsigned int qp;
>> u32 act;
>> +
>> if (vi->mergeable_rx_bufs) {
>> - hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>> - xdp.data = data + hdr_padded_len;
>> + int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>> +
>> + /* Allow consuming headroom but reserve enough space to push
>> + * the descriptor on if we get an XDP_TX return code.
>> + */
>> + xdp.data_hard_start = data - vi->headroom + desc_room;
>> + xdp.data = data + desc_room;
>> xdp.data_end = xdp.data + (len - vi->hdr_len);
>> - buf = data;
>> } else { /* small buffers */
>> struct sk_buff *skb = data;
>> - xdp.data = skb->data;
>> + xdp.data_hard_start = skb->data;
>> + xdp.data = skb->data + vi->headroom;
>> xdp.data_end = xdp.data + len;
>> - buf = skb->data;
>> }
>> act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> switch (act) {
>> case XDP_PASS:
>> + if (!vi->mergeable_rx_bufs)
>> + __skb_pull((struct sk_buff *) data,
>> + xdp.data - xdp.data_hard_start);
>
> Instead of doing things here and virtnet_xdp_xmit(). How about always making
> skb->data point to the buffer head like:
>
> 1) reserve headroom in add_recvbuf_small()
> 2) skb_push(xdp->data - xdp_data_hard_start, skb) if we detect xdp->data was
> modified afer bpf_prog_run_xdp()
>
> Then there's no special code in either XDP_PASS or XDP_TX?
>
Sure. Works for me and avoids if/else code.
>> return XDP_PASS;
>> case XDP_TX:
>> qp = vi->curr_queue_pairs -
>> vi->xdp_queue_pairs +
>> smp_processor_id();
>
> [...]
>
>> +#define VIRTIO_XDP_HEADROOM 256
>> +
>> static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>> {
>> unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
>> struct virtnet_info *vi = netdev_priv(dev);
>> struct bpf_prog *old_prog;
>> u16 xdp_qp = 0, curr_qp;
>> + unsigned int old_hr;
>> int i, err;
>> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
>> @@ -1736,19 +1751,58 @@ static int virtnet_xdp_set(struct net_device *dev,
>> struct bpf_prog *prog)
>> return -ENOMEM;
>> }
>> + old_hr = vi->headroom;
>> + if (prog) {
>> + prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
>> + if (IS_ERR(prog))
>> + return PTR_ERR(prog);
>> + vi->headroom = VIRTIO_XDP_HEADROOM;
>> + } else {
>> + vi->headroom = 0;
>> + }
>> +
>> + /* Changing the headroom in buffers is a disruptive operation because
>> + * existing buffers must be flushed and reallocated. This will happen
>> + * when a xdp program is initially added or xdp is disabled by removing
>> + * the xdp program.
>> + */
>
> We probably need reset the device here, but maybe Michale has more ideas. And if
> we do this, another interesting thing to do is to disable EWMA and always use a
> single page for each packet, this could almost eliminate linearizing.
Well with normal MTU 1500 size we should not hit the linearizing case right? The
question is should we cap the MTU at GOOD_PACKET_LEN vs the current cap of
(PAGE_SIZE - overhead).
>
> Thanks
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-03 16:54 ` John Fastabend
@ 2017-01-03 16:57 ` John Fastabend
2017-01-04 3:22 ` Jason Wang
2017-01-04 3:21 ` Jason Wang
1 sibling, 1 reply; 15+ messages in thread
From: John Fastabend @ 2017-01-03 16:57 UTC (permalink / raw)
To: Jason Wang, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 17-01-03 08:54 AM, John Fastabend wrote:
> On 17-01-02 10:01 PM, Jason Wang wrote:
>>
>>
>> On 2017年01月03日 03:44, John Fastabend wrote:
>>> Add support for XDP adjust head by allocating a 256B header region
>>> that XDP programs can grow into. This is only enabled when a XDP
>>> program is loaded.
>>>
>>> In order to ensure that we do not have to unwind queue headroom push
>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>> unwind vs another queue setup call.
>>>
>>> : There is a problem with this patch as is. When xdp prog is loaded
>>> the old buffers without the 256B headers need to be flushed so that
>>> the bpf prog has the necessary headroom. This patch does this by
>>> calling the virtqueue_detach_unused_buf() and followed by the
>>> virtnet_set_queues() call to reinitialize the buffers. However I
>>> don't believe this is safe per comment in virtio_ring this API
>>> is not valid on an active queue and the only thing we have done
>>> here is napi_disable/napi_enable wrappers which doesn't do anything
>>> to the emulation layer.
>>>
>>> So the RFC is really to find the best solution to this problem.
>>> A couple things come to mind, (a) always allocate the necessary
>>> headroom but this is a bit of a waste (b) add some bit somewhere
>>> to check if the buffer has headroom but this would mean XDP programs
>>> would be broke for a cycle through the ring, (c) figure out how
>>> to deactivate a queue, free the buffers and finally reallocate.
>>> I think (c) is the best choice for now but I'm not seeing the
>>> API to do this so virtio/qemu experts anyone know off-hand
>>> how to make this work? I started looking into the PCI callbacks
>>> reset() and virtio_device_ready() or possibly hitting the right
>>> set of bits with vp_set_status() but my first attempt just hung
>>> the device.
>>
>> Hi John:
>>
>> AFAIK, disabling a specific queue was supported only by virtio 1.0 through
>> queue_enable field in pci common cfg. But unfortunately, qemu does not emulate
>> this at all and legacy device does not even support this. So the safe way is
>> probably reset the device and redo the initialization here.
>>
>
> OK, I'll draft up a fix with a full reset unless Michael has some idea in
> the meantime.
>
>>>
>>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>>> ---
>>> drivers/net/virtio_net.c | 106 +++++++++++++++++++++++++++++++++++-----------
>>> 1 file changed, 80 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 5deeda6..fcc5bd7 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -159,6 +159,9 @@ struct virtnet_info {
>>> /* Ethtool settings */
>>> u8 duplex;
>>> u32 speed;
>>> +
>>> + /* Headroom allocated in RX Queue */
>>> + unsigned int headroom;
>>> };
>>> struct padded_vnet_hdr {
>>> @@ -355,6 +358,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>>> }
>>> if (vi->mergeable_rx_bufs) {
>>> + xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>>> /* Zero header and leave csum up to XDP layers */
>>> hdr = xdp->data;
>>> memset(hdr, 0, vi->hdr_len);
>>> @@ -371,7 +375,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>>> num_sg = 2;
>>> sg_init_table(sq->sg, 2);
>>> sg_set_buf(sq->sg, hdr, vi->hdr_len);
>>> - skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
>>> + skb_to_sgvec(skb, sq->sg + 1, vi->headroom, xdp->data_end - xdp->data);
>>
>> vi->headroom look suspicious, should it be xdp->data - xdp->data_hard_start?
>>
>
> Yep found this as well while testing small packet receive.
>
>>> }
>>> err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
>>> data, GFP_ATOMIC);
>>> @@ -393,34 +397,39 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
>>> struct bpf_prog *xdp_prog,
>>> void *data, int len)
>>> {
>>> - int hdr_padded_len;
>>> struct xdp_buff xdp;
>>> - void *buf;
>>> unsigned int qp;
>>> u32 act;
>>> +
>>> if (vi->mergeable_rx_bufs) {
>>> - hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>>> - xdp.data = data + hdr_padded_len;
>>> + int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>>> +
>>> + /* Allow consuming headroom but reserve enough space to push
>>> + * the descriptor on if we get an XDP_TX return code.
>>> + */
>>> + xdp.data_hard_start = data - vi->headroom + desc_room;
>>> + xdp.data = data + desc_room;
>>> xdp.data_end = xdp.data + (len - vi->hdr_len);
>>> - buf = data;
>>> } else { /* small buffers */
>>> struct sk_buff *skb = data;
>>> - xdp.data = skb->data;
>>> + xdp.data_hard_start = skb->data;
>>> + xdp.data = skb->data + vi->headroom;
>>> xdp.data_end = xdp.data + len;
>>> - buf = skb->data;
>>> }
>>> act = bpf_prog_run_xdp(xdp_prog, &xdp);
>>> switch (act) {
>>> case XDP_PASS:
>>> + if (!vi->mergeable_rx_bufs)
>>> + __skb_pull((struct sk_buff *) data,
>>> + xdp.data - xdp.data_hard_start);
>>
>> Instead of doing things here and virtnet_xdp_xmit(). How about always making
>> skb->data point to the buffer head like:
>>
>> 1) reserve headroom in add_recvbuf_small()
>> 2) skb_push(xdp->data - xdp_data_hard_start, skb) if we detect xdp->data was
>> modified afer bpf_prog_run_xdp()
>>
>> Then there's no special code in either XDP_PASS or XDP_TX?
>>
>
> Sure. Works for me and avoids if/else code.
>
>>> return XDP_PASS;
>>> case XDP_TX:
>>> qp = vi->curr_queue_pairs -
>>> vi->xdp_queue_pairs +
>>> smp_processor_id();
>>
>> [...]
>>
>>> +#define VIRTIO_XDP_HEADROOM 256
>>> +
>>> static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>>> {
>>> unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
>>> struct virtnet_info *vi = netdev_priv(dev);
>>> struct bpf_prog *old_prog;
>>> u16 xdp_qp = 0, curr_qp;
>>> + unsigned int old_hr;
>>> int i, err;
>>> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
>>> @@ -1736,19 +1751,58 @@ static int virtnet_xdp_set(struct net_device *dev,
>>> struct bpf_prog *prog)
>>> return -ENOMEM;
>>> }
>>> + old_hr = vi->headroom;
>>> + if (prog) {
>>> + prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
>>> + if (IS_ERR(prog))
>>> + return PTR_ERR(prog);
>>> + vi->headroom = VIRTIO_XDP_HEADROOM;
>>> + } else {
>>> + vi->headroom = 0;
>>> + }
>>> +
>>> + /* Changing the headroom in buffers is a disruptive operation because
>>> + * existing buffers must be flushed and reallocated. This will happen
>>> + * when a xdp program is initially added or xdp is disabled by removing
>>> + * the xdp program.
>>> + */
>>
>> We probably need reset the device here, but maybe Michale has more ideas. And if
>> we do this, another interesting thing to do is to disable EWMA and always use a
>> single page for each packet, this could almost eliminate linearizing.
>
> Well with normal MTU 1500 size we should not hit the linearizing case right? The
> question is should we cap the MTU at GOOD_PACKET_LEN vs the current cap of
> (PAGE_SIZE - overhead).
Sorry responding to my own post with a bit more detail. I don't really like
going to a page for each packet because we end up with double the pages in use
for the "normal" 1500 MTU case. We could make the xdp allocation scheme smarter
and allocate a page per packet when MTU is greater than 2k instead of using the
EWMA but I would push those types of things at net-next and live with the
linearizing behavior for now or capping the MTU.
>
>>
>> Thanks
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-03 16:57 ` John Fastabend
@ 2017-01-04 3:22 ` Jason Wang
0 siblings, 0 replies; 15+ messages in thread
From: Jason Wang @ 2017-01-04 3:22 UTC (permalink / raw)
To: John Fastabend, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 2017年01月04日 00:57, John Fastabend wrote:
>>>> + /* Changing the headroom in buffers is a disruptive operation because
>>>> + * existing buffers must be flushed and reallocated. This will happen
>>>> + * when a xdp program is initially added or xdp is disabled by removing
>>>> + * the xdp program.
>>>> + */
>>> We probably need to reset the device here, but maybe Michael has more ideas. And if
>>> we do this, another interesting thing to do is to disable EWMA and always use a
>>> single page for each packet, this could almost eliminate linearizing.
>> Well with normal MTU 1500 size we should not hit the linearizing case right? The
>> question is should we cap the MTU at GOOD_PACKET_LEN vs the current cap of
>> (PAGE_SIZE - overhead).
> Sorry responding to my own post with a bit more detail. I don't really like
> going to a page for each packet because we end up with double the pages in use
> for the "normal" 1500 MTU case. We could make the xdp allocation scheme smarter
> and allocate a page per packet when MTU is greater than 2k instead of using the
> EWMA but I would push those types of things at net-next and live with the
> linearizing behavior for now or capping the MTU.
>
Yes, agree.
Thanks
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-03 16:54 ` John Fastabend
2017-01-03 16:57 ` John Fastabend
@ 2017-01-04 3:21 ` Jason Wang
1 sibling, 0 replies; 15+ messages in thread
From: Jason Wang @ 2017-01-04 3:21 UTC (permalink / raw)
To: John Fastabend, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 2017年01月04日 00:54, John Fastabend wrote:
>>> + /* Changing the headroom in buffers is a disruptive operation because
>>> + * existing buffers must be flushed and reallocated. This will happen
>>> + * when a xdp program is initially added or xdp is disabled by removing
>>> + * the xdp program.
>>> + */
>> We probably need to reset the device here, but maybe Michael has more ideas. And if
>> we do this, another interesting thing to do is to disable EWMA and always use a
>> single page for each packet, this could almost eliminate linearizing.
> Well with normal MTU 1500 size we should not hit the linearizing case right?
My reply may not be clear: for 1500 I mean for small buffer only.
Thanks
> The
> question is should we cap the MTU at GOOD_PACKET_LEN vs the current cap of
> (PAGE_SIZE - overhead).
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-03 6:01 ` Jason Wang
2017-01-03 16:54 ` John Fastabend
@ 2017-01-03 22:16 ` Michael S. Tsirkin
2017-01-05 22:57 ` John Fastabend
2017-01-04 18:58 ` John Fastabend
2 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2017-01-03 22:16 UTC (permalink / raw)
To: Jason Wang
Cc: John Fastabend, john.r.fastabend, netdev, alexei.starovoitov,
daniel
On Tue, Jan 03, 2017 at 02:01:27PM +0800, Jason Wang wrote:
>
>
> On 2017年01月03日 03:44, John Fastabend wrote:
> > Add support for XDP adjust head by allocating a 256B header region
> > that XDP programs can grow into. This is only enabled when a XDP
> > program is loaded.
> >
> > In order to ensure that we do not have to unwind queue headroom push
> > queue setup below bpf_prog_add. It reads better to do a prog ref
> > unwind vs another queue setup call.
> >
> > : There is a problem with this patch as is. When xdp prog is loaded
> > the old buffers without the 256B headers need to be flushed so that
> > the bpf prog has the necessary headroom. This patch does this by
> > calling the virtqueue_detach_unused_buf() and followed by the
> > virtnet_set_queues() call to reinitialize the buffers. However I
> > don't believe this is safe per comment in virtio_ring this API
> > is not valid on an active queue and the only thing we have done
> > here is napi_disable/napi_enable wrappers which doesn't do anything
> > to the emulation layer.
> >
> > So the RFC is really to find the best solution to this problem.
> > A couple things come to mind, (a) always allocate the necessary
> > headroom but this is a bit of a waste (b) add some bit somewhere
> > to check if the buffer has headroom but this would mean XDP programs
> > would be broke for a cycle through the ring, (c) figure out how
> > to deactivate a queue, free the buffers and finally reallocate.
> > I think (c) is the best choice for now but I'm not seeing the
> > API to do this so virtio/qemu experts anyone know off-hand
> > how to make this work? I started looking into the PCI callbacks
> > reset() and virtio_device_ready() or possibly hitting the right
> > set of bits with vp_set_status() but my first attempt just hung
> > the device.
>
> Hi John:
>
> AFAIK, disabling a specific queue was supported only by virtio 1.0 through
> queue_enable field in pci common cfg.
In fact 1.0 only allows enabling queues selectively.
We can add disabling by a spec enhancement but
for now reset is the only way.
> But unfortunately, qemu does not
> emulate this at all and legacy device does not even support this. So the
> safe way is probably reset the device and redo the initialization here.
You will also have to re-apply rx filtering if you do this.
Probably sending notification uplink.
> >
> > Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> > ---
> > drivers/net/virtio_net.c | 106 +++++++++++++++++++++++++++++++++++-----------
> > 1 file changed, 80 insertions(+), 26 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 5deeda6..fcc5bd7 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -159,6 +159,9 @@ struct virtnet_info {
> > /* Ethtool settings */
> > u8 duplex;
> > u32 speed;
> > +
> > + /* Headroom allocated in RX Queue */
> > + unsigned int headroom;
> > };
> > struct padded_vnet_hdr {
> > @@ -355,6 +358,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
> > }
> > if (vi->mergeable_rx_bufs) {
> > + xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> > /* Zero header and leave csum up to XDP layers */
> > hdr = xdp->data;
> > memset(hdr, 0, vi->hdr_len);
> > @@ -371,7 +375,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
> > num_sg = 2;
> > sg_init_table(sq->sg, 2);
> > sg_set_buf(sq->sg, hdr, vi->hdr_len);
> > - skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
> > + skb_to_sgvec(skb, sq->sg + 1, vi->headroom, xdp->data_end - xdp->data);
>
> vi->headroom look suspicious, should it be xdp->data - xdp->data_hard_start?
>
> > }
> > err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> > data, GFP_ATOMIC);
> > @@ -393,34 +397,39 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
> > struct bpf_prog *xdp_prog,
> > void *data, int len)
> > {
> > - int hdr_padded_len;
> > struct xdp_buff xdp;
> > - void *buf;
> > unsigned int qp;
> > u32 act;
> > +
> > if (vi->mergeable_rx_bufs) {
> > - hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> > - xdp.data = data + hdr_padded_len;
> > + int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> > +
> > + /* Allow consuming headroom but reserve enough space to push
> > + * the descriptor on if we get an XDP_TX return code.
> > + */
> > + xdp.data_hard_start = data - vi->headroom + desc_room;
> > + xdp.data = data + desc_room;
> > xdp.data_end = xdp.data + (len - vi->hdr_len);
> > - buf = data;
> > } else { /* small buffers */
> > struct sk_buff *skb = data;
> > - xdp.data = skb->data;
> > + xdp.data_hard_start = skb->data;
> > + xdp.data = skb->data + vi->headroom;
> > xdp.data_end = xdp.data + len;
> > - buf = skb->data;
> > }
> > act = bpf_prog_run_xdp(xdp_prog, &xdp);
> > switch (act) {
> > case XDP_PASS:
> > + if (!vi->mergeable_rx_bufs)
> > + __skb_pull((struct sk_buff *) data,
> > + xdp.data - xdp.data_hard_start);
>
> Instead of doing things here and virtnet_xdp_xmit(). How about always making
> skb->data point to the buffer head like:
>
> 1) reserve headroom in add_recvbuf_small()
> 2) skb_push(xdp->data - xdp_data_hard_start, skb) if we detect xdp->data was
> modified afer bpf_prog_run_xdp()
>
> Then there's no special code in either XDP_PASS or XDP_TX?
>
> > return XDP_PASS;
> > case XDP_TX:
> > qp = vi->curr_queue_pairs -
> > vi->xdp_queue_pairs +
> > smp_processor_id();
>
> [...]
>
> > +#define VIRTIO_XDP_HEADROOM 256
> > +
> > static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> > {
> > unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
> > struct virtnet_info *vi = netdev_priv(dev);
> > struct bpf_prog *old_prog;
> > u16 xdp_qp = 0, curr_qp;
> > + unsigned int old_hr;
> > int i, err;
> > if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> > @@ -1736,19 +1751,58 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
> > return -ENOMEM;
> > }
> > + old_hr = vi->headroom;
> > + if (prog) {
> > + prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> > + if (IS_ERR(prog))
> > + return PTR_ERR(prog);
> > + vi->headroom = VIRTIO_XDP_HEADROOM;
> > + } else {
> > + vi->headroom = 0;
> > + }
> > +
> > + /* Changing the headroom in buffers is a disruptive operation because
> > + * existing buffers must be flushed and reallocated. This will happen
> > + * when a xdp program is initially added or xdp is disabled by removing
> > + * the xdp program.
> > + */
>
> We probably need to reset the device here, but maybe Michael has more ideas.
> And if we do this, another interesting thing to do is to disable EWMA and
> always use a single page for each packet, this could almost eliminate
> linearizing.
>
> Thanks
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-03 22:16 ` Michael S. Tsirkin
@ 2017-01-05 22:57 ` John Fastabend
2017-01-06 0:39 ` Michael S. Tsirkin
0 siblings, 1 reply; 15+ messages in thread
From: John Fastabend @ 2017-01-05 22:57 UTC (permalink / raw)
To: Michael S. Tsirkin, Jason Wang
Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 17-01-03 02:16 PM, Michael S. Tsirkin wrote:
> On Tue, Jan 03, 2017 at 02:01:27PM +0800, Jason Wang wrote:
>>
>>
>> On 2017年01月03日 03:44, John Fastabend wrote:
>>> Add support for XDP adjust head by allocating a 256B header region
>>> that XDP programs can grow into. This is only enabled when a XDP
>>> program is loaded.
>>>
>>> In order to ensure that we do not have to unwind queue headroom push
>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>> unwind vs another queue setup call.
>>>
>>> : There is a problem with this patch as is. When xdp prog is loaded
>>> the old buffers without the 256B headers need to be flushed so that
>>> the bpf prog has the necessary headroom. This patch does this by
>>> calling the virtqueue_detach_unused_buf() and followed by the
>>> virtnet_set_queues() call to reinitialize the buffers. However I
>>> don't believe this is safe per comment in virtio_ring this API
>>> is not valid on an active queue and the only thing we have done
>>> here is napi_disable/napi_enable wrappers which doesn't do anything
>>> to the emulation layer.
>>>
>>> So the RFC is really to find the best solution to this problem.
>>> A couple things come to mind, (a) always allocate the necessary
>>> headroom but this is a bit of a waste (b) add some bit somewhere
>>> to check if the buffer has headroom but this would mean XDP programs
>>> would be broke for a cycle through the ring, (c) figure out how
>>> to deactivate a queue, free the buffers and finally reallocate.
>>> I think (c) is the best choice for now but I'm not seeing the
>>> API to do this so virtio/qemu experts anyone know off-hand
>>> how to make this work? I started looking into the PCI callbacks
>>> reset() and virtio_device_ready() or possibly hitting the right
>>> set of bits with vp_set_status() but my first attempt just hung
>>> the device.
>>
>> Hi John:
>>
>> AFAIK, disabling a specific queue was supported only by virtio 1.0 through
>> queue_enable field in pci common cfg.
>
> In fact 1.0 only allows enabling queues selectively.
> We can add disabling by a spec enhancement but
> for now reset is the only way.
>
>
>> But unfortunately, qemu does not
>> emulate this at all and legacy device does not even support this. So the
>> safe way is probably reset the device and redo the initialization here.
>
> You will also have to re-apply rx filtering if you do this.
> Probably sending notification uplink.
>
The following seems to hang the device on the next virtnet_send_command()
I expected this to meet the reset requirements from the spec because I
believe its the same flow coming out of restore(). For a real patch we
don't actually need to kfree all the structs and reallocate them but
I was expecting the below to work. Any ideas/hints?
static int virtnet_xdp_reset(struct virtnet_info *vi)
{
int i, ret;
netif_device_detach(vi->dev);
cancel_delayed_work_sync(&vi->refill);
if (netif_running(vi->dev)) {
for (i = 0; i < vi->max_queue_pairs; i++)
napi_disable(&vi->rq[i].napi);
}
remove_vq_common(vi, false);
ret = init_vqs(vi);
if (ret)
return ret;
virtio_device_ready(vi->vdev);
if (netif_running(vi->dev)) {
for (i = 0; i < vi->curr_queue_pairs; i++)
if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
for (i = 0; i < vi->max_queue_pairs; i++)
virtnet_napi_enable(&vi->rq[i]);
}
netif_device_attach(vi->dev);
return 0;
}
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-05 22:57 ` John Fastabend
@ 2017-01-06 0:39 ` Michael S. Tsirkin
2017-01-06 3:28 ` John Fastabend
0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2017-01-06 0:39 UTC (permalink / raw)
To: John Fastabend
Cc: Jason Wang, john.r.fastabend, netdev, alexei.starovoitov, daniel
On Thu, Jan 05, 2017 at 02:57:23PM -0800, John Fastabend wrote:
> On 17-01-03 02:16 PM, Michael S. Tsirkin wrote:
> > On Tue, Jan 03, 2017 at 02:01:27PM +0800, Jason Wang wrote:
> >>
> >>
> >> On 2017年01月03日 03:44, John Fastabend wrote:
> >>> Add support for XDP adjust head by allocating a 256B header region
> >>> that XDP programs can grow into. This is only enabled when a XDP
> >>> program is loaded.
> >>>
> >>> In order to ensure that we do not have to unwind queue headroom push
> >>> queue setup below bpf_prog_add. It reads better to do a prog ref
> >>> unwind vs another queue setup call.
> >>>
> >>> : There is a problem with this patch as is. When xdp prog is loaded
> >>> the old buffers without the 256B headers need to be flushed so that
> >>> the bpf prog has the necessary headroom. This patch does this by
> >>> calling the virtqueue_detach_unused_buf() and followed by the
> >>> virtnet_set_queues() call to reinitialize the buffers. However I
> >>> don't believe this is safe per comment in virtio_ring this API
> >>> is not valid on an active queue and the only thing we have done
> >>> here is napi_disable/napi_enable wrappers which doesn't do anything
> >>> to the emulation layer.
> >>>
> >>> So the RFC is really to find the best solution to this problem.
> >>> A couple things come to mind, (a) always allocate the necessary
> >>> headroom but this is a bit of a waste (b) add some bit somewhere
> >>> to check if the buffer has headroom but this would mean XDP programs
> >>> would be broke for a cycle through the ring, (c) figure out how
> >>> to deactivate a queue, free the buffers and finally reallocate.
> >>> I think (c) is the best choice for now but I'm not seeing the
> >>> API to do this so virtio/qemu experts anyone know off-hand
> >>> how to make this work? I started looking into the PCI callbacks
> >>> reset() and virtio_device_ready() or possibly hitting the right
> >>> set of bits with vp_set_status() but my first attempt just hung
> >>> the device.
> >>
> >> Hi John:
> >>
> >> AFAIK, disabling a specific queue was supported only by virtio 1.0 through
> >> queue_enable field in pci common cfg.
> >
> > In fact 1.0 only allows enabling queues selectively.
> > We can add disabling by a spec enhancement but
> > for now reset is the only way.
> >
> >
> >> But unfortunately, qemu does not
> >> emulate this at all and legacy device does not even support this. So the
> >> safe way is probably reset the device and redo the initialization here.
> >
> > You will also have to re-apply rx filtering if you do this.
> > Probably sending notification uplink.
> >
>
> The following seems to hang the device on the next virtnet_send_command()
> I expected this to meet the reset requirements from the spec because I
> believe its the same flow coming out of restore(). For a real patch we
> don't actually need to kfree all the structs and reallocate them but
> I was expecting the below to work. Any ideas/hints?
Restore assumes device was previously reset.
You want to combine freeze+restore.
> static int virtnet_xdp_reset(struct virtnet_info *vi)
> {
> int i, ret;
>
> netif_device_detach(vi->dev);
> cancel_delayed_work_sync(&vi->refill);
> if (netif_running(vi->dev)) {
> for (i = 0; i < vi->max_queue_pairs; i++)
> napi_disable(&vi->rq[i].napi);
> }
>
> remove_vq_common(vi, false);
> ret = init_vqs(vi);
> if (ret)
> return ret;
> virtio_device_ready(vi->vdev);
>
> if (netif_running(vi->dev)) {
> for (i = 0; i < vi->curr_queue_pairs; i++)
> if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
> schedule_delayed_work(&vi->refill, 0);
>
> for (i = 0; i < vi->max_queue_pairs; i++)
> virtnet_napi_enable(&vi->rq[i]);
> }
> netif_device_attach(vi->dev);
> return 0;
> }
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-06 0:39 ` Michael S. Tsirkin
@ 2017-01-06 3:28 ` John Fastabend
0 siblings, 0 replies; 15+ messages in thread
From: John Fastabend @ 2017-01-06 3:28 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Jason Wang, john.r.fastabend, netdev, alexei.starovoitov, daniel
On 17-01-05 04:39 PM, Michael S. Tsirkin wrote:
> On Thu, Jan 05, 2017 at 02:57:23PM -0800, John Fastabend wrote:
>> On 17-01-03 02:16 PM, Michael S. Tsirkin wrote:
>>> On Tue, Jan 03, 2017 at 02:01:27PM +0800, Jason Wang wrote:
>>>>
>>>>
>>>> On 2017年01月03日 03:44, John Fastabend wrote:
>>>>> Add support for XDP adjust head by allocating a 256B header region
>>>>> that XDP programs can grow into. This is only enabled when a XDP
>>>>> program is loaded.
>>>>>
>>>>> In order to ensure that we do not have to unwind queue headroom push
>>>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>>>> unwind vs another queue setup call.
>>>>>
>>>>> : There is a problem with this patch as is. When xdp prog is loaded
>>>>> the old buffers without the 256B headers need to be flushed so that
>>>>> the bpf prog has the necessary headroom. This patch does this by
>>>>> calling the virtqueue_detach_unused_buf() and followed by the
>>>>> virtnet_set_queues() call to reinitialize the buffers. However I
>>>>> don't believe this is safe per comment in virtio_ring this API
>>>>> is not valid on an active queue and the only thing we have done
>>>>> here is napi_disable/napi_enable wrappers which doesn't do anything
>>>>> to the emulation layer.
>>>>>
>>>>> So the RFC is really to find the best solution to this problem.
>>>>> A couple things come to mind, (a) always allocate the necessary
>>>>> headroom but this is a bit of a waste (b) add some bit somewhere
>>>>> to check if the buffer has headroom but this would mean XDP programs
>>>>> would be broke for a cycle through the ring, (c) figure out how
>>>>> to deactivate a queue, free the buffers and finally reallocate.
>>>>> I think (c) is the best choice for now but I'm not seeing the
>>>>> API to do this so virtio/qemu experts anyone know off-hand
>>>>> how to make this work? I started looking into the PCI callbacks
>>>>> reset() and virtio_device_ready() or possibly hitting the right
>>>>> set of bits with vp_set_status() but my first attempt just hung
>>>>> the device.
>>>>
>>>> Hi John:
>>>>
>>>> AFAIK, disabling a specific queue was supported only by virtio 1.0 through
>>>> queue_enable field in pci common cfg.
>>>
>>> In fact 1.0 only allows enabling queues selectively.
>>> We can add disabling by a spec enhancement but
>>> for now reset is the only way.
>>>
>>>
>>>> But unfortunately, qemu does not
>>>> emulate this at all and legacy device does not even support this. So the
>>>> safe way is probably reset the device and redo the initialization here.
>>>
>>> You will also have to re-apply rx filtering if you do this.
>>> Probably sending notification uplink.
>>>
>>
>> The following seems to hang the device on the next virtnet_send_command()
>> I expected this to meet the reset requirements from the spec because I
>> believe its the same flow coming out of restore(). For a real patch we
>> don't actually need to kfree all the structs and reallocate them but
>> I was expecting the below to work. Any ideas/hints?
>
> Restore assumes device was previously reset.
> You want to combine freeze+restore.
Yep the below is actually freeze+restore I just omitted the freeze portion
from the description.
>
>> static int virtnet_xdp_reset(struct virtnet_info *vi)
>> {
>> int i, ret;
>>
// insert flush_work here doesn't seem to help hang.
>> netif_device_detach(vi->dev);
>> cancel_delayed_work_sync(&vi->refill);
>> if (netif_running(vi->dev)) {
>> for (i = 0; i < vi->max_queue_pairs; i++)
>> napi_disable(&vi->rq[i].napi);
>> }
>>
>> remove_vq_common(vi, false);
// everything above is freeze sans flush_work and virtnet_cpu_notif_remove
// the rest below is restore where I call virtnet_set_queues later outside
// the reset function.
>> ret = init_vqs(vi);
>> if (ret)
>> return ret;
>> virtio_device_ready(vi->vdev);
>>
>> if (netif_running(vi->dev)) {
>> for (i = 0; i < vi->curr_queue_pairs; i++)
>> if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
>> schedule_delayed_work(&vi->refill, 0);
>>
>> for (i = 0; i < vi->max_queue_pairs; i++)
>> virtnet_napi_enable(&vi->rq[i]);
>> }
>> netif_device_attach(vi->dev);
>> return 0;
>> }
Could be a locking problem I'm missing so I'll look at it a bit more but
good to know we expect freeze/restore to reset the device.
.John
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-03 6:01 ` Jason Wang
2017-01-03 16:54 ` John Fastabend
2017-01-03 22:16 ` Michael S. Tsirkin
@ 2017-01-04 18:58 ` John Fastabend
2017-01-05 3:10 ` Jason Wang
2 siblings, 1 reply; 15+ messages in thread
From: John Fastabend @ 2017-01-04 18:58 UTC (permalink / raw)
To: Jason Wang, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
[...]
>> @@ -393,34 +397,39 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
>> struct bpf_prog *xdp_prog,
>> void *data, int len)
>> {
>> - int hdr_padded_len;
>> struct xdp_buff xdp;
>> - void *buf;
>> unsigned int qp;
>> u32 act;
>> +
>> if (vi->mergeable_rx_bufs) {
>> - hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>> - xdp.data = data + hdr_padded_len;
>> + int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>> +
>> + /* Allow consuming headroom but reserve enough space to push
>> + * the descriptor on if we get an XDP_TX return code.
>> + */
>> + xdp.data_hard_start = data - vi->headroom + desc_room;
>> + xdp.data = data + desc_room;
>> xdp.data_end = xdp.data + (len - vi->hdr_len);
>> - buf = data;
>> } else { /* small buffers */
>> struct sk_buff *skb = data;
>> - xdp.data = skb->data;
>> + xdp.data_hard_start = skb->data;
>> + xdp.data = skb->data + vi->headroom;
>> xdp.data_end = xdp.data + len;
>> - buf = skb->data;
>> }
>> act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> switch (act) {
>> case XDP_PASS:
>> + if (!vi->mergeable_rx_bufs)
>> + __skb_pull((struct sk_buff *) data,
>> + xdp.data - xdp.data_hard_start);
>
> Instead of doing things here and virtnet_xdp_xmit(). How about always making
> skb->data point to the buffer head like:
>
> 1) reserve headroom in add_recvbuf_small()
> 2) skb_push(xdp->data - xdp_data_hard_start, skb) if we detect xdp->data was
> modified afer bpf_prog_run_xdp()
>
> Then there's no special code in either XDP_PASS or XDP_TX?
>
Alternatively moving the pull into the receive_small XDP handler also
removes the special case. I'll submit a patch shortly let me know what
you think.
>> return XDP_PASS;
>> case XDP_TX:
>> qp = vi->curr_queue_pairs -
>> vi->xdp_queue_pairs +
>> smp_processor_id();
>
> [...]
>
.John
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2017-01-04 18:58 ` John Fastabend
@ 2017-01-05 3:10 ` Jason Wang
0 siblings, 0 replies; 15+ messages in thread
From: Jason Wang @ 2017-01-05 3:10 UTC (permalink / raw)
To: John Fastabend, mst; +Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel
On 2017年01月05日 02:58, John Fastabend wrote:
> [...]
>
>>> @@ -393,34 +397,39 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
>>> struct bpf_prog *xdp_prog,
>>> void *data, int len)
>>> {
>>> - int hdr_padded_len;
>>> struct xdp_buff xdp;
>>> - void *buf;
>>> unsigned int qp;
>>> u32 act;
>>> +
>>> if (vi->mergeable_rx_bufs) {
>>> - hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>>> - xdp.data = data + hdr_padded_len;
>>> + int desc_room = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>>> +
>>> + /* Allow consuming headroom but reserve enough space to push
>>> + * the descriptor on if we get an XDP_TX return code.
>>> + */
>>> + xdp.data_hard_start = data - vi->headroom + desc_room;
>>> + xdp.data = data + desc_room;
>>> xdp.data_end = xdp.data + (len - vi->hdr_len);
>>> - buf = data;
>>> } else { /* small buffers */
>>> struct sk_buff *skb = data;
>>> - xdp.data = skb->data;
>>> + xdp.data_hard_start = skb->data;
>>> + xdp.data = skb->data + vi->headroom;
>>> xdp.data_end = xdp.data + len;
>>> - buf = skb->data;
>>> }
>>> act = bpf_prog_run_xdp(xdp_prog, &xdp);
>>> switch (act) {
>>> case XDP_PASS:
>>> + if (!vi->mergeable_rx_bufs)
>>> + __skb_pull((struct sk_buff *) data,
>>> + xdp.data - xdp.data_hard_start);
>> Instead of doing things here and virtnet_xdp_xmit(). How about always making
>> skb->data point to the buffer head like:
>>
>> 1) reserve headroom in add_recvbuf_small()
>> 2) skb_push(xdp->data - xdp_data_hard_start, skb) if we detect xdp->data was
>> modified afer bpf_prog_run_xdp()
>>
>> Then there's no special code in either XDP_PASS or XDP_TX?
>>
> Alternatively moving the pull into the receive_small XDP handler also
> removes the special case. I'll submit a patch shortly let me know what
> you think.
>
I'm ok with this.
Thanks
^ permalink raw reply [flat|nested] 15+ messages in thread
* [RFC PATCH] virtio_net: XDP support for adjust_head
@ 2016-12-23 18:43 John Fastabend
2016-12-23 21:46 ` John Fastabend
0 siblings, 1 reply; 15+ messages in thread
From: John Fastabend @ 2016-12-23 18:43 UTC (permalink / raw)
To: jasowang, jakub.kicinski, mst
Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel,
virtualization
Add support for XDP adjust head by allocating a 256B header region
that XDP programs can grow into. This is only enabled when a XDP
program is loaded.
In order to ensure that we do not have to unwind queue headroom push
queue setup below bpf_prog_add. It reads better to do a prog ref
unwind vs another queue setup call.
TBD merge with Jason Wang's fixes, do a bit more testing, note
big_packet support is removed by Jason as well.
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
drivers/net/virtio_net.c | 53 +++++++++++++++++++++++++++++++---------------
1 file changed, 36 insertions(+), 17 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 08327e0..1a93158 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -159,6 +159,9 @@ struct virtnet_info {
/* Ethtool settings */
u8 duplex;
u32 speed;
+
+ /* Headroom allocated in RX Queue */
+ unsigned int headroom;
};
struct padded_vnet_hdr {
@@ -392,6 +395,7 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
else
hdr_padded_len = sizeof(struct padded_vnet_hdr);
+ xdp.data_hard_start = buf - vi->headroom;
xdp.data = buf + hdr_padded_len;
xdp.data_end = xdp.data + (len - vi->hdr_len);
@@ -430,9 +434,12 @@ static struct sk_buff *receive_big(struct net_device *dev,
void *buf,
unsigned int len)
{
- struct bpf_prog *xdp_prog;
struct page *page = buf;
+ struct bpf_prog *xdp_prog;
struct sk_buff *skb;
+ int offset;
+
+ offset = vi->headroom;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -442,7 +449,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
- act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
+ act = do_xdp_prog(vi, rq, xdp_prog, page, offset, len);
switch (act) {
case XDP_PASS:
break;
@@ -456,7 +463,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
}
rcu_read_unlock();
- skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
+ skb = page_to_skb(vi, rq, page, offset, len, PAGE_SIZE);
if (unlikely(!skb))
goto err;
@@ -797,10 +804,10 @@ static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
/* rq->sg[0], rq->sg[1] share the same page */
/* a separated rq->sg[0] for header - required in case !any_header_sg */
- sg_set_buf(&rq->sg[0], p, vi->hdr_len);
+ sg_set_buf(&rq->sg[0], p, vi->hdr_len + vi->headroom);
/* rq->sg[1] for data packet, from offset */
- offset = sizeof(struct padded_vnet_hdr);
+ offset = vi->headroom + sizeof(struct padded_vnet_hdr);
sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
/* chain first in list head */
@@ -823,24 +830,27 @@ static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}
-static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
+static int add_recvbuf_mergeable(struct virtnet_info *vi,
+ struct receive_queue *rq, gfp_t gfp)
{
struct page_frag *alloc_frag = &rq->alloc_frag;
+ unsigned int headroom = vi->headroom;
char *buf;
unsigned long ctx;
int err;
unsigned int len, hole;
len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
- if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
+ if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
return -ENOMEM;
buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+ buf += headroom; /* advance address leaving hole at front of pkt */
ctx = mergeable_buf_to_ctx(buf, len);
get_page(alloc_frag->page);
- alloc_frag->offset += len;
+ alloc_frag->offset += len + headroom;
hole = alloc_frag->size - alloc_frag->offset;
- if (hole < len) {
+ if (hole < len + headroom) {
/* To avoid internal fragmentation, if there is very likely not
* enough space for another buffer, add the remaining space to
* the current buffer. This extra space is not included in
@@ -874,7 +884,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
gfp |= __GFP_COLD;
do {
if (vi->mergeable_rx_bufs)
- err = add_recvbuf_mergeable(rq, gfp);
+ err = add_recvbuf_mergeable(vi, rq, gfp);
else if (vi->big_packets)
err = add_recvbuf_big(vi, rq, gfp);
else
@@ -1669,12 +1679,15 @@ static void virtnet_init_settings(struct net_device *dev)
.set_settings = virtnet_set_settings,
};
+#define VIRTIO_XDP_HEADROOM 256
+
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
struct virtnet_info *vi = netdev_priv(dev);
struct bpf_prog *old_prog;
u16 xdp_qp = 0, curr_qp;
+ unsigned int old_hr;
int i, err;
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1704,19 +1717,25 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
return -ENOMEM;
}
+ old_hr = vi->headroom;
+ if (prog) {
+ prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ vi->headroom = VIRTIO_XDP_HEADROOM;
+ } else {
+ vi->headroom = 0;
+ }
+
err = virtnet_set_queues(vi, curr_qp + xdp_qp);
if (err) {
dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
+ vi->headroom = old_hr;
+ if (prog)
+ bpf_prog_sub(prog, vi->max_queue_pairs - 1);
return err;
}
- if (prog) {
- prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
- if (IS_ERR(prog)) {
- virtnet_set_queues(vi, curr_qp);
- return PTR_ERR(prog);
- }
- }
vi->xdp_queue_pairs = xdp_qp;
netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [RFC PATCH] virtio_net: XDP support for adjust_head
2016-12-23 18:43 John Fastabend
@ 2016-12-23 21:46 ` John Fastabend
0 siblings, 0 replies; 15+ messages in thread
From: John Fastabend @ 2016-12-23 21:46 UTC (permalink / raw)
To: jasowang, jakub.kicinski, mst
Cc: john.r.fastabend, netdev, alexei.starovoitov, daniel,
virtualization
On 16-12-23 10:43 AM, John Fastabend wrote:
> Add support for XDP adjust head by allocating a 256B header region
> that XDP programs can grow into. This is only enabled when a XDP
> program is loaded.
>
> In order to ensure that we do not have to unwind queue headroom push
> queue setup below bpf_prog_add. It reads better to do a prog ref
> unwind vs another queue setup call.
>
> TBD merge with Jason Wang's fixes, do a bit more testing, note
> big_packet support is removed by Jason as well.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
> drivers/net/virtio_net.c | 53 +++++++++++++++++++++++++++++++---------------
> 1 file changed, 36 insertions(+), 17 deletions(-)
>
ah dang, I sent this from the wrong tree so it was missing a fix; sorry
for the noise. I'll send an update on top of net shortly.
.John
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2017-01-06 3:28 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-01-02 19:44 [RFC PATCH] virtio_net: XDP support for adjust_head John Fastabend
2017-01-02 19:47 ` John Fastabend
2017-01-03 6:01 ` Jason Wang
2017-01-03 16:54 ` John Fastabend
2017-01-03 16:57 ` John Fastabend
2017-01-04 3:22 ` Jason Wang
2017-01-04 3:21 ` Jason Wang
2017-01-03 22:16 ` Michael S. Tsirkin
2017-01-05 22:57 ` John Fastabend
2017-01-06 0:39 ` Michael S. Tsirkin
2017-01-06 3:28 ` John Fastabend
2017-01-04 18:58 ` John Fastabend
2017-01-05 3:10 ` Jason Wang
-- strict thread matches above, loose matches on Subject: below --
2016-12-23 18:43 John Fastabend
2016-12-23 21:46 ` John Fastabend
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).