Re: [PATCH v4 bpf-next 2/3] veth: rework veth_xdp_rcv_skb in order to accept non-linear skb

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Toke Høiland-Jørgensen" <toke@redhat.com>
To: Lorenzo Bianconi <lorenzo@kernel.org>,
	bpf@vger.kernel.org, netdev@vger.kernel.org
Cc: davem@davemloft.net, kuba@kernel.org, ast@kernel.org,
	daniel@iogearbox.net, brouer@redhat.com, pabeni@redhat.com,
	echaudro@redhat.com, lorenzo.bianconi@redhat.com,
	toshiaki.makita1@gmail.com, andrii@kernel.org
Subject: Re: [PATCH v4 bpf-next 2/3] veth: rework veth_xdp_rcv_skb in order to accept non-linear skb
Date: Thu, 10 Mar 2022 12:21:39 +0100	[thread overview]
Message-ID: <87ee3auk70.fsf@toke.dk> (raw)
In-Reply-To: <24703dbc3477a4b3aaf908f6226a566d27969f83.1646755129.git.lorenzo@kernel.org>

Lorenzo Bianconi <lorenzo@kernel.org> writes:

> Introduce veth_convert_xdp_buff_from_skb routine in order to
> convert a non-linear skb into a xdp buffer. If the received skb
> is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
> in a new skb composed by order-0 pages for the linear and the
> fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
> we have enough headroom for xdp.
> This is a preliminary patch to allow attaching xdp programs with frags
> support on veth devices.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

It's cool that we can do this! A few comments below:

> ---
>  drivers/net/veth.c | 174 ++++++++++++++++++++++++++++++---------------
>  net/core/xdp.c     |   1 +
>  2 files changed, 119 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index b77ce3fdcfe8..47b21b1d2fd9 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -433,21 +433,6 @@ static void veth_set_multicast_list(struct net_device *dev)
>  {
>  }
>  
> -static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
> -				      int buflen)
> -{
> -	struct sk_buff *skb;
> -
> -	skb = build_skb(head, buflen);
> -	if (!skb)
> -		return NULL;
> -
> -	skb_reserve(skb, headroom);
> -	skb_put(skb, len);
> -
> -	return skb;
> -}
> -
>  static int veth_select_rxq(struct net_device *dev)
>  {
>  	return smp_processor_id() % dev->real_num_rx_queues;
> @@ -695,72 +680,143 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
>  	}
>  }
>  
> -static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> -					struct sk_buff *skb,
> -					struct veth_xdp_tx_bq *bq,
> -					struct veth_stats *stats)
> +static void veth_xdp_get(struct xdp_buff *xdp)
>  {
> -	u32 pktlen, headroom, act, metalen, frame_sz;
> -	void *orig_data, *orig_data_end;
> -	struct bpf_prog *xdp_prog;
> -	int mac_len, delta, off;
> -	struct xdp_buff xdp;
> +	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> +	int i;
>  
> -	skb_prepare_for_gro(skb);
> +	get_page(virt_to_page(xdp->data));
> +	if (likely(!xdp_buff_has_frags(xdp)))
> +		return;
>  
> -	rcu_read_lock();
> -	xdp_prog = rcu_dereference(rq->xdp_prog);
> -	if (unlikely(!xdp_prog)) {
> -		rcu_read_unlock();
> -		goto out;
> -	}
> +	for (i = 0; i < sinfo->nr_frags; i++)
> +		__skb_frag_ref(&sinfo->frags[i]);
> +}
>  
> -	mac_len = skb->data - skb_mac_header(skb);
> -	pktlen = skb->len + mac_len;
> -	headroom = skb_headroom(skb) - mac_len;
> +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> +					  struct xdp_buff *xdp,
> +					  struct sk_buff **pskb)
> +{

nit: It's not really "converting" and skb into an xdp_buff, since the
xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?

> +	struct sk_buff *skb = *pskb;
> +	u32 frame_sz;
>  
>  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> +	    skb_shinfo(skb)->nr_frags) {

So this always clones the skb if it has frags? Is that really needed?

Also, there's a lot of memory allocation and copying going on here; have
you measured the performance?

> +		u32 size, len, max_head_size, off;
>  		struct sk_buff *nskb;
> -		int size, head_off;
> -		void *head, *start;
>  		struct page *page;
> +		int i, head_off;
>  
> -		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
> -		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -		if (size > PAGE_SIZE)
> +		/* We need a private copy of the skb and data buffers since
> +		 * the ebpf program can modify it. We segment the original skb
> +		 * into order-0 pages without linearize it.
> +		 *
> +		 * Make sure we have enough space for linear and paged area
> +		 */
> +		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
> +						  VETH_XDP_HEADROOM);
> +		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
>  			goto drop;
>  
> +		/* Allocate skb head */
>  		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
>  		if (!page)
>  			goto drop;
>  
> -		head = page_address(page);
> -		start = head + VETH_XDP_HEADROOM;
> -		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
> -			page_frag_free(head);
> +		nskb = build_skb(page_address(page), PAGE_SIZE);
> +		if (!nskb) {
> +			put_page(page);
>  			goto drop;
>  		}
>  
> -		nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
> -				      skb->len, PAGE_SIZE);
> -		if (!nskb) {
> -			page_frag_free(head);
> +		skb_reserve(nskb, VETH_XDP_HEADROOM);
> +		size = min_t(u32, skb->len, max_head_size);
> +		if (skb_copy_bits(skb, 0, nskb->data, size)) {
> +			consume_skb(nskb);
>  			goto drop;
>  		}
> +		skb_put(nskb, size);
>  
>  		skb_copy_header(nskb, skb);
>  		head_off = skb_headroom(nskb) - skb_headroom(skb);
>  		skb_headers_offset_update(nskb, head_off);
> +
> +		/* Allocate paged area of new skb */
> +		off = size;
> +		len = skb->len - off;
> +
> +		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> +			page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
> +			if (!page) {
> +				consume_skb(nskb);
> +				goto drop;
> +			}
> +
> +			size = min_t(u32, len, PAGE_SIZE);
> +			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
> +			if (skb_copy_bits(skb, off, page_address(page),
> +					  size)) {
> +				consume_skb(nskb);
> +				goto drop;
> +			}
> +
> +			len -= size;
> +			off += size;
> +		}
> +
>  		consume_skb(skb);
>  		skb = nskb;
> +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> +		goto drop;
>  	}
>  
>  	/* SKB "head" area always have tailroom for skb_shared_info */
>  	frame_sz = skb_end_pointer(skb) - skb->head;
>  	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -	xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq);
> -	xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true);
> +	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
> +	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
> +			 skb_headlen(skb), true);
> +
> +	if (skb_is_nonlinear(skb)) {
> +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else {
> +		xdp_buff_clear_frags_flag(xdp);
> +	}
> +	*pskb = skb;
> +
> +	return 0;
> +drop:
> +	consume_skb(skb);
> +	*pskb = NULL;
> +
> +	return -ENOMEM;
> +}
> +
> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> +					struct sk_buff *skb,
> +					struct veth_xdp_tx_bq *bq,
> +					struct veth_stats *stats)
> +{
> +	void *orig_data, *orig_data_end;
> +	struct bpf_prog *xdp_prog;
> +	struct xdp_buff xdp;
> +	u32 act, metalen;
> +	int off;
> +
> +	skb_prepare_for_gro(skb);
> +
> +	rcu_read_lock();
> +	xdp_prog = rcu_dereference(rq->xdp_prog);
> +	if (unlikely(!xdp_prog)) {
> +		rcu_read_unlock();
> +		goto out;
> +	}
> +
> +	__skb_push(skb, skb->data - skb_mac_header(skb));
> +	if (veth_convert_xdp_buff_from_skb(rq, &xdp, &skb))
> +		goto drop;
>  
>  	orig_data = xdp.data;
>  	orig_data_end = xdp.data_end;
> @@ -771,7 +827,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  	case XDP_PASS:
>  		break;
>  	case XDP_TX:
> -		get_page(virt_to_page(xdp.data));
> +		veth_xdp_get(&xdp);
>  		consume_skb(skb);
>  		xdp.rxq->mem = rq->xdp_mem;
>  		if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
> @@ -783,7 +839,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  		rcu_read_unlock();
>  		goto xdp_xmit;
>  	case XDP_REDIRECT:
> -		get_page(virt_to_page(xdp.data));
> +		veth_xdp_get(&xdp);
>  		consume_skb(skb);
>  		xdp.rxq->mem = rq->xdp_mem;
>  		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
> @@ -806,18 +862,24 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  	rcu_read_unlock();
>  
>  	/* check if bpf_xdp_adjust_head was used */
> -	delta = orig_data - xdp.data;
> -	off = mac_len + delta;
> +	off = orig_data - xdp.data;
>  	if (off > 0)
>  		__skb_push(skb, off);
>  	else if (off < 0)
>  		__skb_pull(skb, -off);
> -	skb->mac_header -= delta;
> +
> +	skb_reset_mac_header(skb);
>  
>  	/* check if bpf_xdp_adjust_tail was used */
>  	off = xdp.data_end - orig_data_end;
>  	if (off != 0)
>  		__skb_put(skb, off); /* positive on grow, negative on shrink */
> +
> +	if (xdp_buff_has_frags(&xdp))
> +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> +	else
> +		skb->data_len = 0;

We can remove entire frags using xdp_adjust_tail, right? Will that get
propagated in the right way to the skb frags due to the dual use of
skb_shared_info, or?

next prev parent reply	other threads:[~2022-03-10 11:21 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-08 16:05 [PATCH v4 bpf-next 0/3] introduce xdp frags support to veth driver Lorenzo Bianconi
2022-03-08 16:05 ` [PATCH v4 bpf-next 1/3] net: veth: account total xdp_frame len running ndo_xdp_xmit Lorenzo Bianconi
2022-03-10 11:10   ` Toke Høiland-Jørgensen
2022-03-15  5:32   ` John Fastabend
2022-03-08 16:05 ` [PATCH v4 bpf-next 2/3] veth: rework veth_xdp_rcv_skb in order to accept non-linear skb Lorenzo Bianconi
2022-03-10 11:21   ` Toke Høiland-Jørgensen [this message]
2022-03-10 11:43     ` Lorenzo Bianconi
2022-03-10 19:06       ` Toke Høiland-Jørgensen
2022-03-10 19:26         ` Lorenzo Bianconi
2022-03-10 23:46           ` Lorenzo Bianconi
2022-03-12 21:18         ` Jakub Kicinski
2022-03-08 16:06 ` [PATCH v4 bpf-next 3/3] veth: allow jumbo frames in xdp mode Lorenzo Bianconi
2022-03-10 11:30   ` Toke Høiland-Jørgensen
2022-03-10 15:06     ` Lorenzo Bianconi
2022-03-10 18:53       ` Toke Høiland-Jørgensen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87ee3auk70.fsf@toke.dk \
    --to=toke@redhat.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=brouer@redhat.com \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=echaudro@redhat.com \
    --cc=kuba@kernel.org \
    --cc=lorenzo.bianconi@redhat.com \
    --cc=lorenzo@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=toshiaki.makita1@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.