From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jason Wang Subject: Re: [net PATCH v3 5/5] virtio_net: XDP support for adjust_head Date: Fri, 13 Jan 2017 15:41:41 +0800 Message-ID: <7c41d9b0-aa77-1485-4890-177ca7b01e50@redhat.com> References: <20170113024908.4535.8835.stgit@john-Precision-Tower-5810> <20170113025206.4535.28363.stgit@john-Precision-Tower-5810> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 8bit Cc: john.r.fastabend@intel.com, netdev@vger.kernel.org, alexei.starovoitov@gmail.com, daniel@iogearbox.net To: John Fastabend , mst@redhat.com Return-path: Received: from mx1.redhat.com ([209.132.183.28]:50536 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751021AbdAMHvW (ORCPT ); Fri, 13 Jan 2017 02:51:22 -0500 In-Reply-To: <20170113025206.4535.28363.stgit@john-Precision-Tower-5810> Sender: netdev-owner@vger.kernel.org List-ID: On 2017年01月13日 10:52, John Fastabend wrote: > Add support for XDP adjust head by allocating a 256B header region > that XDP programs can grow into. This is only enabled when a XDP > program is loaded. > > In order to ensure that we do not have to unwind queue headroom push > queue setup below bpf_prog_add. It reads better to do a prog ref > unwind vs another queue setup call. > > At the moment this code must do a full reset to ensure old buffers > without headroom on program add or with headroom on program removal > are not used incorrectly in the datapath. Ideally we would only > have to disable/enable the RX queues being updated but there is no > API to do this at the moment in virtio so use the big hammer. In > practice it is likely not that big of a problem as this will only > happen when XDP is enabled/disabled changing programs does not > require the reset. 
There is some risk that the driver may either > have an allocation failure or for some reason fail to correctly > negotiate with the underlying backend in this case the driver will > be left uninitialized. I have not seen this ever happen on my test > systems and for what its worth this same failure case can occur > from probe and other contexts in virtio framework. > > Signed-off-by: John Fastabend > --- > drivers/net/virtio_net.c | 155 ++++++++++++++++++++++++++++++++++++++++------ > drivers/virtio/virtio.c | 9 ++- > include/linux/virtio.h | 3 + > 3 files changed, 144 insertions(+), 23 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 6041828..8b897e7 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -28,6 +28,7 @@ > #include > #include > #include > +#include > #include > > static int napi_weight = NAPI_POLL_WEIGHT; > @@ -159,6 +160,9 @@ struct virtnet_info { > /* Ethtool settings */ > u8 duplex; > u32 speed; > + > + /* Headroom allocated in RX Queue */ > + unsigned int headroom; If this could not be changed in any way, better use a macro instead of a field here. And there's even no need to add an extra parameter to add_recvbuf_mergeable(). > }; > > struct padded_vnet_hdr { > @@ -359,6 +363,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, > } > > if (vi->mergeable_rx_bufs) { > + xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf); I fail to understand why this is needed. We should have excluded vnet header from xdp->data even before bpf_prog_run_xdp(). 
> /* Zero header and leave csum up to XDP layers */ > hdr = xdp->data; > memset(hdr, 0, vi->hdr_len); > @@ -375,7 +380,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, > num_sg = 2; > sg_init_table(sq->sg, 2); > sg_set_buf(sq->sg, hdr, vi->hdr_len); > - skb_to_sgvec(skb, sq->sg + 1, 0, skb->len); > + skb_to_sgvec(skb, sq->sg + 1, > + xdp->data - xdp->data_hard_start, > + xdp->data_end - xdp->data); > } > err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, > data, GFP_ATOMIC); > @@ -401,7 +408,6 @@ static struct sk_buff *receive_small(struct net_device *dev, > struct bpf_prog *xdp_prog; > > len -= vi->hdr_len; > - skb_trim(skb, len); > > rcu_read_lock(); > xdp_prog = rcu_dereference(rq->xdp_prog); > @@ -413,11 +419,15 @@ static struct sk_buff *receive_small(struct net_device *dev, > if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags)) > goto err_xdp; > > - xdp.data = skb->data; > + xdp.data_hard_start = skb->data; > + xdp.data = skb->data + vi->headroom; > xdp.data_end = xdp.data + len; > act = bpf_prog_run_xdp(xdp_prog, &xdp); > switch (act) { > case XDP_PASS: > + /* Recalculate length in case bpf program changed it */ > + len = xdp.data_end - xdp.data; > + __skb_pull(skb, xdp.data - xdp.data_hard_start); How about doing this just after bpf_prog_run_xdp() for XDP_TX too? This is more readable and there's no need to change xmit path. > break; > case XDP_TX: > virtnet_xdp_xmit(vi, rq, &xdp, skb); > @@ -432,6 +442,7 @@ static struct sk_buff *receive_small(struct net_device *dev, > } > rcu_read_unlock(); > > + skb_trim(skb, len); > return skb; > > err_xdp: > @@ -569,7 +580,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > if (unlikely(hdr->hdr.gso_type)) > goto err_xdp; > > + /* Allow consuming headroom but reserve enough space to push > + * the descriptor on if we get an XDP_TX return code. 
+ */ > data = page_address(xdp_page) + offset; > + xdp.data_hard_start = data - vi->headroom + desc_room; Two possible issues here: 1) If we want to adjust header after linearizing, we should reserve room for that page, but I don't see any code for this. 2) If the header has been adjusted, looks like we need to change the offset value; otherwise, page_to_skb() won't build a correct skb for us for XDP_PASS. > xdp.data = data + desc_room; > xdp.data_end = xdp.data + (len - vi->hdr_len); > act = bpf_prog_run_xdp(xdp_prog, &xdp); > @@ -748,20 +763,21 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, > static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, > gfp_t gfp) > { > + int headroom = GOOD_PACKET_LEN + vi->headroom; > struct sk_buff *skb; > struct virtio_net_hdr_mrg_rxbuf *hdr; > int err; > > - skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp); > + skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp); > if (unlikely(!skb)) > return -ENOMEM; > > - skb_put(skb, GOOD_PACKET_LEN); > + skb_put(skb, headroom); > > hdr = skb_vnet_hdr(skb); > sg_init_table(rq->sg, 2); > sg_set_buf(rq->sg, hdr, vi->hdr_len); > - skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); > + skb_to_sgvec(skb, rq->sg + 1, vi->headroom, skb->len - vi->headroom); > > err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp); > if (err < 0) > @@ -829,24 +845,27 @@ static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len) > return ALIGN(len, MERGEABLE_BUFFER_ALIGN); > } > > -static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp) > +static int add_recvbuf_mergeable(struct virtnet_info *vi, > + struct receive_queue *rq, gfp_t gfp) > { > struct page_frag *alloc_frag = &rq->alloc_frag; > + unsigned int headroom = vi->headroom; > char *buf; > unsigned long ctx; > int err; > unsigned int len, hole; > > len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len); > - if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp))) > 
+ if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp))) > return -ENOMEM; > > buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > + buf += headroom; /* advance address leaving hole at front of pkt */ > ctx = mergeable_buf_to_ctx(buf, len); > get_page(alloc_frag->page); > - alloc_frag->offset += len; > + alloc_frag->offset += len + headroom; > hole = alloc_frag->size - alloc_frag->offset; > - if (hole < len) { > + if (hole < len + headroom) { > /* To avoid internal fragmentation, if there is very likely not > * enough space for another buffer, add the remaining space to > * the current buffer. This extra space is not included in > @@ -880,7 +899,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, > gfp |= __GFP_COLD; > do { > if (vi->mergeable_rx_bufs) > - err = add_recvbuf_mergeable(rq, gfp); > + err = add_recvbuf_mergeable(vi, rq, gfp); > else if (vi->big_packets) > err = add_recvbuf_big(vi, rq, gfp); > else > @@ -1675,12 +1694,90 @@ static void virtnet_init_settings(struct net_device *dev) > .set_settings = virtnet_set_settings, > }; > > +#define VIRTIO_XDP_HEADROOM 256 > + > +static int init_vqs(struct virtnet_info *vi); > +static void remove_vq_common(struct virtnet_info *vi, bool lock); > + > +/* Reset virtio device with RTNL held this is very similar to the > + * freeze()/restore() logic except we need to ensure locking. It is > + * possible that this routine may fail and leave the driver in a > + * failed state. However assuming the driver negotiated correctly > + * at probe time we _should_ be able to (re)negotiate driver again. > + */ Instead of duplicating code and exporting helpers, why not use virtio_device_freeze()/virtio_device_restore()? For rtnl_lock in restore, you can probably avoid it by checking rtnl_is_locked() before? 
> +static int virtnet_xdp_reset(struct virtnet_info *vi) > +{ > + struct virtio_device *vdev = vi->vdev; > + unsigned int status; > + int i, ret; > + > + /* Disable and unwind rings */ > + virtio_config_disable(vdev); > + vdev->failed = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_FAILED; > + > + netif_device_detach(vi->dev); > + cancel_delayed_work_sync(&vi->refill); > + if (netif_running(vi->dev)) { > + for (i = 0; i < vi->max_queue_pairs; i++) > + napi_disable(&vi->rq[i].napi); > + } > + > + remove_vq_common(vi, false); > + > + /* Do a reset per virtio spec recommendation */ > + vdev->config->reset(vdev); > + > + /* Acknowledge that we've seen the device. */ > + status = vdev->config->get_status(vdev); > + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_ACKNOWLEDGE); > + > + /* Notify driver is up and finalize features per specification. The > + * error code from finalize features is checked here but should not > + * fail because we assume features were previously synced successfully. 
> + */ > + status = vdev->config->get_status(vdev); > + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER); > + ret = virtio_finalize_features(vdev); > + if (ret) { > + netdev_warn(vi->dev, "virtio_finalize_features failed during reset aborting\n"); > + goto err; > + } > + > + ret = init_vqs(vi); > + if (ret) { > + netdev_warn(vi->dev, "init_vqs failed during reset aborting\n"); > + goto err; > + } > + virtio_device_ready(vi->vdev); > + > + if (netif_running(vi->dev)) { > + for (i = 0; i < vi->curr_queue_pairs; i++) > + if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) > + schedule_delayed_work(&vi->refill, 0); > + > + for (i = 0; i < vi->max_queue_pairs; i++) > + virtnet_napi_enable(&vi->rq[i]); > + } > + netif_device_attach(vi->dev); > + /* Finally, tell the device we're all set */ > + status = vdev->config->get_status(vdev); > + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER_OK); > + virtio_config_enable(vdev); > + > + return 0; > +err: > + status = vdev->config->get_status(vdev); > + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_FAILED); > + return ret; > +} [...]