Linux virtualization list
 help / color / mirror / Atom feed
* Re: [PATCH net-next 09/11] tuntap: accept an array of XDP buffs through sendmsg()
From: Michael S. Tsirkin @ 2018-09-06 17:51 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-10-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:24PM +0800, Jason Wang wrote:
> This patch implement TUN_MSG_PTR msg_control type. This type allows
> the caller to pass an array of XDP buffs to tuntap through ptr field
> of the tun_msg_control. If an XDP program is attached, tuntap can run
> XDP program directly. If not, tuntap will build skb and do a fast
> receiving since part of the work has been done by vhost_net.
> 
> This will avoid lots of indirect calls thus improves the icache
> utilization and allows to do XDP batched flushing when doing XDP
> redirection.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Is most of the benefit in batched flushing or skipping
indirect calls? Because if it's flushing we can gain
most of it easily by adding an analog of xmit_more.

> ---
>  drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 100 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index c839a4bdcbd9..069db2e5dd08 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -2424,22 +2424,119 @@ static void tun_sock_write_space(struct sock *sk)
>  	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
>  }
>  
> +static int tun_xdp_one(struct tun_struct *tun,
> +		       struct tun_file *tfile,
> +		       struct xdp_buff *xdp, int *flush)
> +{
> +	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
> +	struct tun_pcpu_stats *stats;
> +	struct bpf_prog *xdp_prog;
> +	struct sk_buff *skb = NULL;
> +	u32 rxhash = 0, act;
> +	int buflen = *(int *)xdp->data_hard_start;
> +	int err = 0;
> +	bool skb_xdp = false;
> +
> +	xdp_prog = rcu_dereference(tun->xdp_prog);
> +	if (xdp_prog) {
> +		if (gso->gso_type) {
> +			skb_xdp = true;
> +			goto build;
> +		}
> +		xdp_set_data_meta_invalid(xdp);
> +		xdp->rxq = &tfile->xdp_rxq;
> +		act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
> +		if (err)
> +			goto out;
> +		if (act == XDP_REDIRECT)
> +			*flush = true;
> +		if (act != XDP_PASS)
> +			goto out;
> +	}
> +
> +build:
> +	skb = build_skb(xdp->data_hard_start, buflen);
> +	if (!skb) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> +	skb_put(skb, xdp->data_end - xdp->data);
> +
> +	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
> +		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
> +		kfree_skb(skb);
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	skb->protocol = eth_type_trans(skb, tun->dev);
> +	skb_reset_network_header(skb);
> +	skb_probe_transport_header(skb, 0);
> +
> +	if (skb_xdp) {
> +		err = do_xdp_generic(xdp_prog, skb);
> +		if (err != XDP_PASS)
> +			goto out;
> +	}
> +
> +	if (!rcu_dereference(tun->steering_prog))
> +		rxhash = __skb_get_hash_symmetric(skb);
> +
> +	netif_receive_skb(skb);
> +
> +	stats = get_cpu_ptr(tun->pcpu_stats);
> +	u64_stats_update_begin(&stats->syncp);
> +	stats->rx_packets++;
> +	stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&stats->syncp);
> +	put_cpu_ptr(stats);
> +
> +	if (rxhash)
> +		tun_flow_update(tun, rxhash, tfile);
> +
> +out:
> +	return err;
> +}
> +
>  static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>  {
> -	int ret;
> +	int ret, i;
>  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>  	struct tun_struct *tun = tun_get(tfile);
>  	struct tun_msg_ctl *ctl = m->msg_control;
> +	struct xdp_buff *xdp;
>  
>  	if (!tun)
>  		return -EBADFD;
>  
> -	if (ctl && ctl->type != TUN_MSG_UBUF)
> -		return -EINVAL;
> +	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> +		int n = ctl->type >> 16;
> +		int flush = 0;
> +
> +		local_bh_disable();
> +		rcu_read_lock();
> +
> +		for (i = 0; i < n; i++) {
> +			xdp = &((struct xdp_buff *)ctl->ptr)[i];
> +			tun_xdp_one(tun, tfile, xdp, &flush);
> +		}
> +
> +		if (flush)
> +			xdp_do_flush_map();
> +
> +		rcu_read_unlock();
> +		local_bh_enable();
> +
> +		ret = total_len;
> +		goto out;
> +	}
>  
>  	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
>  			   m->msg_flags & MSG_DONTWAIT,
>  			   m->msg_flags & MSG_MORE);
> +out:
>  	tun_put(tun);
>  	return ret;
>  }
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 10/11] tap: accept an array of XDP buffs through sendmsg()
From: Michael S. Tsirkin @ 2018-09-06 18:00 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-11-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:25PM +0800, Jason Wang wrote:
> This patch implement TUN_MSG_PTR msg_control type. This type allows
> the caller to pass an array of XDP buffs to tuntap through ptr field
> of the tun_msg_control. Tap will build skb through those XDP buffers.
> 
> This will avoid lots of indirect calls thus improves the icache
> utilization and allows to do XDP batched flushing when doing XDP
> redirection.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 71 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 7996ed7cbf18..50eb7bf22225 100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -1146,14 +1146,83 @@ static const struct file_operations tap_fops = {
>  #endif
>  };
>  
> +static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
> +{
> +	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
> +	int buflen = *(int *)xdp->data_hard_start;
> +	int vnet_hdr_len = 0;
> +	struct tap_dev *tap;
> +	struct sk_buff *skb;
> +	int err, depth;
> +
> +	if (q->flags & IFF_VNET_HDR)
> +		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
> +
> +	skb = build_skb(xdp->data_hard_start, buflen);
> +	if (!skb) {
> +		err = -ENOMEM;
> +		goto err;
> +	}

So fundamentally why is it called XDP?
We just build and skb, don't we?

> +
> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> +	skb_put(skb, xdp->data_end - xdp->data);
> +
> +	skb_set_network_header(skb, ETH_HLEN);
> +	skb_reset_mac_header(skb);
> +	skb->protocol = eth_hdr(skb)->h_proto;
> +
> +	if (vnet_hdr_len) {
> +		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
> +		if (err)
> +			goto err_kfree;
> +	}
> +
> +	skb_probe_transport_header(skb, ETH_HLEN);
> +
> +	/* Move network header to the right position for VLAN tagged packets */
> +	if ((skb->protocol == htons(ETH_P_8021Q) ||
> +	     skb->protocol == htons(ETH_P_8021AD)) &&
> +	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
> +		skb_set_network_header(skb, depth);
> +
> +	rcu_read_lock();
> +	tap = rcu_dereference(q->tap);
> +	if (tap) {
> +		skb->dev = tap->dev;
> +		dev_queue_xmit(skb);
> +	} else {
> +		kfree_skb(skb);
> +	}
> +	rcu_read_unlock();
> +
> +	return 0;
> +
> +err_kfree:
> +	kfree_skb(skb);
> +err:
> +	rcu_read_lock();
> +		tap = rcu_dereference(q->tap);
> +	if (tap && tap->count_tx_dropped)
> +		tap->count_tx_dropped(tap);
> +	rcu_read_unlock();
> +	return err;
> +}
> +
>  static int tap_sendmsg(struct socket *sock, struct msghdr *m,
>  		       size_t total_len)
>  {
>  	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
>  	struct tun_msg_ctl *ctl = m->msg_control;
> +	struct xdp_buff *xdp;
> +	int i;
>  
> -	if (ctl && ctl->type != TUN_MSG_UBUF)
> -		return -EINVAL;
> +	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> +		for (i = 0; i < ctl->type >> 16; i++) {
> +			xdp = &((struct xdp_buff *)ctl->ptr)[i];
> +			tap_get_user_xdp(q, xdp);
> +		}
> +		return 0;
> +	}
>  
>  	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
>  			    m->msg_flags & MSG_DONTWAIT);
> -- 
> 2.17.1

^ permalink raw reply

* Re: [virtio-dev] Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-09-07  1:22 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180827170005-mutt-send-email-mst@kernel.org>

On Mon, Aug 27, 2018 at 05:00:40PM +0300, Michael S. Tsirkin wrote:
> Are there still plans to test the performance with vost pmd?
> vhost doesn't seem to show a performance gain ...
> 

I tried some performance tests with vhost PMD. In guest, the
XDP program will return XDP_DROP directly. And in host, testpmd
will do txonly fwd.

When burst size is 1 and packet size is 64 in testpmd and
testpmd needs to iterate 5 Tx queues (but only the first two
queues are enabled) to prepare and inject packets, I got ~12%
performance boost (5.7Mpps -> 6.4Mpps). And if the vhost PMD
is faster (e.g. just need to iterate the first two queues to
prepare and inject packets), then I got similar performance
for both rings (~9.9Mpps) (packed ring's performance can be
lower, because it's more complicated in driver.)

I think packed ring makes vhost PMD faster, but it doesn't make
the driver faster. In packed ring, the ring is simplified, and
the handling of the ring in vhost (device) is also simplified,
but things are not simplified in driver, e.g. although there is
no desc table in the virtqueue anymore, driver still needs to
maintain a private desc state table (which is still managed as
a list in this patch set) to support the out-of-order desc
processing in vhost (device).

I think this patch set is mainly to make the driver have a full
functional support for the packed ring, which makes it possible
to leverage the packed ring feature in vhost (device). But I'm
not sure whether there is any other better idea, I'd like to
hear your thoughts. Thanks!


> 
> On Wed, Jul 11, 2018 at 10:27:06AM +0800, Tiwei Bie wrote:
> > Hello everyone,
> > 
> > This patch set implements packed ring support in virtio driver.
> > 
> > Some functional tests have been done with Jason's
> > packed ring implementation in vhost:
> > 
> > https://lkml.org/lkml/2018/7/3/33
> > 
> > Both of ping and netperf worked as expected.
> > 
> > v1 -> v2:
> > - Use READ_ONCE() to read event off_wrap and flags together (Jason);
> > - Add comments related to ccw (Jason);
> > 
> > RFC (v6) -> v1:
> > - Avoid extra virtio_wmb() in virtqueue_enable_cb_delayed_packed()
> >   when event idx is off (Jason);
> > - Fix bufs calculation in virtqueue_enable_cb_delayed_packed() (Jason);
> > - Test the state of the desc at used_idx instead of last_used_idx
> >   in virtqueue_enable_cb_delayed_packed() (Jason);
> > - Save wrap counter (as part of queue state) in the return value
> >   of virtqueue_enable_cb_prepare_packed();
> > - Refine the packed ring definitions in uapi;
> > - Rebase on the net-next tree;
> > 
> > RFC v5 -> RFC v6:
> > - Avoid tracking addr/len/flags when DMA API isn't used (MST/Jason);
> > - Define wrap counter as bool (Jason);
> > - Use ALIGN() in vring_init_packed() (Jason);
> > - Avoid using pointer to track `next` in detach_buf_packed() (Jason);
> > - Add comments for barriers (Jason);
> > - Don't enable RING_PACKED on ccw for now (noticed by Jason);
> > - Refine the memory barrier in virtqueue_poll();
> > - Add a missing memory barrier in virtqueue_enable_cb_delayed_packed();
> > - Remove the hacks in virtqueue_enable_cb_prepare_packed();
> > 
> > RFC v4 -> RFC v5:
> > - Save DMA addr, etc in desc state (Jason);
> > - Track used wrap counter;
> > 
> > RFC v3 -> RFC v4:
> > - Make ID allocation support out-of-order (Jason);
> > - Various fixes for EVENT_IDX support;
> > 
> > RFC v2 -> RFC v3:
> > - Split into small patches (Jason);
> > - Add helper virtqueue_use_indirect() (Jason);
> > - Just set id for the last descriptor of a list (Jason);
> > - Calculate the prev in virtqueue_add_packed() (Jason);
> > - Fix/improve desc suppression code (Jason/MST);
> > - Refine the code layout for XXX_split/packed and wrappers (MST);
> > - Fix the comments and API in uapi (MST);
> > - Remove the BUG_ON() for indirect (Jason);
> > - Some other refinements and bug fixes;
> > 
> > RFC v1 -> RFC v2:
> > - Add indirect descriptor support - compile test only;
> > - Add event suppression supprt - compile test only;
> > - Move vring_packed_init() out of uapi (Jason, MST);
> > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > - Avoid using '%' operator (Jason);
> > - Rename free_head -> next_avail_idx (Jason);
> > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > - Some other refinements and bug fixes;
> > 
> > Thanks!
> > 
> > Tiwei Bie (5):
> >   virtio: add packed ring definitions
> >   virtio_ring: support creating packed ring
> >   virtio_ring: add packed ring support
> >   virtio_ring: add event idx support in packed ring
> >   virtio_ring: enable packed ring
> > 
> >  drivers/s390/virtio/virtio_ccw.c   |   14 +
> >  drivers/virtio/virtio_ring.c       | 1365 ++++++++++++++++++++++------
> >  include/linux/virtio_ring.h        |    8 +-
> >  include/uapi/linux/virtio_config.h |    3 +
> >  include/uapi/linux/virtio_ring.h   |   43 +
> >  5 files changed, 1157 insertions(+), 276 deletions(-)
> > 
> > -- 
> > 2.18.0
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
> 

^ permalink raw reply

* Re: [PATCH net-next 01/11] net: sock: introduce SOCK_XDP
From: Jason Wang @ 2018-09-07  3:07 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906125615-mutt-send-email-mst@kernel.org>



On 2018年09月07日 00:56, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:16PM +0800, Jason Wang wrote:
>> This patch introduces a new sock flag - SOCK_XDP. This will be used
>> for notifying the upper layer that XDP program is attached on the
>> lower socket, and requires for extra headroom.
>>
>> TUN will be the first user.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> In fact vhost is the 1st user, right? So this can be
> pushed out to become patch 10/11.

Better with an independent patch, since patch 10/11 can work without XDP.

Thanks

>
>> ---
>>   drivers/net/tun.c  | 19 +++++++++++++++++++
>>   include/net/sock.h |  1 +
>>   2 files changed, 20 insertions(+)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index ebd07ad82431..2c548bd20393 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -869,6 +869,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>>   		tun_napi_init(tun, tfile, napi);
>>   	}
>>   
>> +	if (rtnl_dereference(tun->xdp_prog))
>> +		sock_set_flag(&tfile->sk, SOCK_XDP);
>> +
>>   	tun_set_real_num_queues(tun);
>>   
>>   	/* device is allowed to go away first, so no need to hold extra
>> @@ -1241,13 +1244,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>>   		       struct netlink_ext_ack *extack)
>>   {
>>   	struct tun_struct *tun = netdev_priv(dev);
>> +	struct tun_file *tfile;
>>   	struct bpf_prog *old_prog;
>> +	int i;
>>   
>>   	old_prog = rtnl_dereference(tun->xdp_prog);
>>   	rcu_assign_pointer(tun->xdp_prog, prog);
>>   	if (old_prog)
>>   		bpf_prog_put(old_prog);
>>   
>> +	for (i = 0; i < tun->numqueues; i++) {
>> +		tfile = rtnl_dereference(tun->tfiles[i]);
>> +		if (prog)
>> +			sock_set_flag(&tfile->sk, SOCK_XDP);
>> +		else
>> +			sock_reset_flag(&tfile->sk, SOCK_XDP);
>> +	}
>> +	list_for_each_entry(tfile, &tun->disabled, next) {
>> +		if (prog)
>> +			sock_set_flag(&tfile->sk, SOCK_XDP);
>> +		else
>> +			sock_reset_flag(&tfile->sk, SOCK_XDP);
>> +	}
>> +
>>   	return 0;
>>   }
>>   
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 433f45fc2d68..38cae35f6e16 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -800,6 +800,7 @@ enum sock_flags {
>>   	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
>>   	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
>>   	SOCK_TXTIME,
>> +	SOCK_XDP, /* XDP is attached */
>>   };
>>   
>>   #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 02/11] tuntap: switch to use XDP_PACKET_HEADROOM
From: Jason Wang @ 2018-09-07  3:12 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906125718-mutt-send-email-mst@kernel.org>



On 2018年09月07日 00:57, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:17PM +0800, Jason Wang wrote:
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
>
> any idea why didn't we do this straight away?

Dunno, but git grep told me not all XDP capable driver use this (e.g 
virtio_net has its own value).

Anyway, I think it's ok for driver to have their specific value if they 
can make sure the value is equal or greater than XDP_PACKET_HEADROOM.

Thanks

>
>> ---
>>   drivers/net/tun.c | 3 +--
>>   1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 2c548bd20393..d3677a544b56 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -113,7 +113,6 @@ do {								\
>>   } while (0)
>>   #endif
>>   
>> -#define TUN_HEADROOM 256
>>   #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
>>   
>>   /* TUN device flags */
>> @@ -1654,7 +1653,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	rcu_read_lock();
>>   	xdp_prog = rcu_dereference(tun->xdp_prog);
>>   	if (xdp_prog)
>> -		pad += TUN_HEADROOM;
>> +		pad += XDP_PACKET_HEADROOM;
>>   	buflen += SKB_DATA_ALIGN(len + pad);
>>   	rcu_read_unlock();
>>   
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 04/11] tuntap: simplify error handling in tun_build_skb()
From: Jason Wang @ 2018-09-07  3:22 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906130425-mutt-send-email-mst@kernel.org>



On 2018年09月07日 01:14, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:19PM +0800, Jason Wang wrote:
>> There's no need to duplicate page get logic in each action. So this
>> patch tries to get page and calculate the offset before processing XDP
>> actions, and undo them when meet errors (we don't care the performance
>> on errors). This will be used for factoring out XDP logic.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> I see some issues with this one.
>
>> ---
>>   drivers/net/tun.c | 37 ++++++++++++++++---------------------
>>   1 file changed, 16 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 372caf7d67d9..f8cdcfa392c3 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1642,7 +1642,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   				     int len, int *skb_xdp)
>>   {
>>   	struct page_frag *alloc_frag = &current->task_frag;
>> -	struct sk_buff *skb;
>> +	struct sk_buff *skb = NULL;
>>   	struct bpf_prog *xdp_prog;
>>   	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>>   	unsigned int delta = 0;
>> @@ -1668,6 +1668,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	if (copied != len)
>>   		return ERR_PTR(-EFAULT);
>>   
>> +	get_page(alloc_frag->page);
>> +	alloc_frag->offset += buflen;
>> +
> This adds an atomic op on XDP_DROP which is a data path
> operation for some workloads.

Yes, I have patch on top to amortize this, the idea is to have a very 
big refcount once after the frag was allocated and maintain a bias and 
decrease them all when allocating new frags.'

>
>>   	/* There's a small window that XDP may be set after the check
>>   	 * of xdp_prog above, this should be rare and for simplicity
>>   	 * we do XDP on skb in case the headroom is not enough.
>> @@ -1695,23 +1698,15 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   
>>   		switch (act) {
>>   		case XDP_REDIRECT:
>> -			get_page(alloc_frag->page);
>> -			alloc_frag->offset += buflen;
>>   			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
>>   			xdp_do_flush_map();
>>   			if (err)
>> -				goto err_redirect;
>> -			rcu_read_unlock();
>> -			local_bh_enable();
>> -			return NULL;
>> +				goto err_xdp;
>> +			goto out;
>>   		case XDP_TX:
>> -			get_page(alloc_frag->page);
>> -			alloc_frag->offset += buflen;
>>   			if (tun_xdp_tx(tun->dev, &xdp) < 0)
>> -				goto err_redirect;
>> -			rcu_read_unlock();
>> -			local_bh_enable();
>> -			return NULL;
>> +				goto err_xdp;
>> +			goto out;
>>   		case XDP_PASS:
>>   			delta = orig_data - xdp.data;
>>   			len = xdp.data_end - xdp.data;
>> @@ -1730,23 +1725,23 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	local_bh_enable();
>>   
>>   	skb = build_skb(buf, buflen);
>> -	if (!skb)
>> -		return ERR_PTR(-ENOMEM);
>> +	if (!skb) {
>> +		skb = ERR_PTR(-ENOMEM);
>> +		goto out;
> So goto out will skip put_page, and we did
> do get_page above. Seems wrong. You should
> goto err_skb or something like this.

Yes, looks like err_xdp.

>
>
>> +	}
>>   
>>   	skb_reserve(skb, pad - delta);
>>   	skb_put(skb, len);
>> -	get_page(alloc_frag->page);
>> -	alloc_frag->offset += buflen;
>>   
>>   	return skb;
>>   
>> -err_redirect:
>> -	put_page(alloc_frag->page);
>>   err_xdp:
>> +	alloc_frag->offset -= buflen;
>> +	put_page(alloc_frag->page);
>> +out:
> Out here isn't an error at all, is it?  You should not mix return and
> error handling IMHO.

If you mean the name, I can rename the label to "drop".

>
>
>
>>   	rcu_read_unlock();
>>   	local_bh_enable();
>> -	this_cpu_inc(tun->pcpu_stats->rx_dropped);
> Doesn't this break rx_dropped accounting?

Let me fix this.

Thanks

>> -	return NULL;
>> +	return skb;
>>   }
>>   
>>   /* Get packet from user space buffer */
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 05/11] tuntap: tweak on the path of non-xdp case in tun_build_skb()
From: Jason Wang @ 2018-09-07  3:24 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906131509-mutt-send-email-mst@kernel.org>



On 2018年09月07日 01:16, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:20PM +0800, Jason Wang wrote:
>> If we're sure not to go native XDP, there's no need for several things
>> like bh and rcu stuffs.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> True...
>
>> ---
>>   drivers/net/tun.c | 9 ++++++---
>>   1 file changed, 6 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index f8cdcfa392c3..389aa0727cc6 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1675,10 +1675,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	 * of xdp_prog above, this should be rare and for simplicity
>>   	 * we do XDP on skb in case the headroom is not enough.
>>   	 */
>> -	if (hdr->gso_type || !xdp_prog)
>> +	if (hdr->gso_type || !xdp_prog) {
>>   		*skb_xdp = 1;
>> -	else
>> -		*skb_xdp = 0;
>> +		goto build;
>> +	}
>> +
>> +	*skb_xdp = 0;
>>   
>>   	local_bh_disable();
>>   	rcu_read_lock();
>> @@ -1724,6 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	rcu_read_unlock();
>>   	local_bh_enable();
>>   
>> +build:
> But this is spaghetti code. Please just put common
> code into functions and call them, don't goto.

Ok, will do this.

Thanks

>
>>   	skb = build_skb(buf, buflen);
>>   	if (!skb) {
>>   		skb = ERR_PTR(-ENOMEM);
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 06/11] tuntap: split out XDP logic
From: Jason Wang @ 2018-09-07  3:29 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906131703-mutt-send-email-mst@kernel.org>



On 2018年09月07日 01:21, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:21PM +0800, Jason Wang wrote:
>> This patch split out XDP logic into a single function. This make it to
>> be reused by XDP batching path in the following patch.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/net/tun.c | 84 ++++++++++++++++++++++++++++-------------------
>>   1 file changed, 51 insertions(+), 33 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 389aa0727cc6..21b125020b3b 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1635,6 +1635,44 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
>>   	return true;
>>   }
>>   
>> +static u32 tun_do_xdp(struct tun_struct *tun,
>> +		      struct tun_file *tfile,
>> +		      struct bpf_prog *xdp_prog,
>> +		      struct xdp_buff *xdp,
>> +		      int *err)
>> +{
>> +	u32 act = bpf_prog_run_xdp(xdp_prog, xdp);
>> +
>> +	switch (act) {
>> +	case XDP_REDIRECT:
>> +		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
>> +		xdp_do_flush_map();
>> +		if (*err)
>> +			break;
>> +		goto out;
>> +	case XDP_TX:
>> +		*err = tun_xdp_tx(tun->dev, xdp);
>> +		if (*err < 0)
>> +			break;
>> +		*err = 0;
>> +		goto out;
>> +	case XDP_PASS:
>> +		goto out;
> Do we need goto? why not just return?

I don't see any difference.

>
>> +	default:
>> +		bpf_warn_invalid_xdp_action(act);
>> +		/* fall through */
>> +	case XDP_ABORTED:
>> +		trace_xdp_exception(tun->dev, xdp_prog, act);
>> +		/* fall through */
>> +	case XDP_DROP:
>> +		break;
>> +	}
>> +
>> +	put_page(virt_to_head_page(xdp->data_hard_start));
> put here because caller does get_page :( Not pretty.
> I'd move this out to the caller.

Then we need a switch in the caller, not sure it's better.

>
>> +out:
>> +	return act;
> How about combining err and act? err is < 0 XDP_PASS is > 0.
> No need for pointers then.

Ok.

>
>> +}
>> +
>>   static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   				     struct tun_file *tfile,
>>   				     struct iov_iter *from,
>> @@ -1645,10 +1683,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	struct sk_buff *skb = NULL;
>>   	struct bpf_prog *xdp_prog;
>>   	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> -	unsigned int delta = 0;
>>   	char *buf;
>>   	size_t copied;
>> -	int err, pad = TUN_RX_PAD;
>> +	int pad = TUN_RX_PAD;
>> +	int err = 0;
>>   
>>   	rcu_read_lock();
>>   	xdp_prog = rcu_dereference(tun->xdp_prog);
>> @@ -1685,9 +1723,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   	local_bh_disable();
>>   	rcu_read_lock();
>>   	xdp_prog = rcu_dereference(tun->xdp_prog);
>> -	if (xdp_prog && !*skb_xdp) {
>> +	if (xdp_prog) {
>>   		struct xdp_buff xdp;
>> -		void *orig_data;
>>   		u32 act;
>>   
>>   		xdp.data_hard_start = buf;
>> @@ -1695,33 +1732,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   		xdp_set_data_meta_invalid(&xdp);
>>   		xdp.data_end = xdp.data + len;
>>   		xdp.rxq = &tfile->xdp_rxq;
>> -		orig_data = xdp.data;
>> -		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> -
>> -		switch (act) {
>> -		case XDP_REDIRECT:
>> -			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
>> -			xdp_do_flush_map();
>> -			if (err)
>> -				goto err_xdp;
>> -			goto out;
>> -		case XDP_TX:
>> -			if (tun_xdp_tx(tun->dev, &xdp) < 0)
>> -				goto err_xdp;
>> -			goto out;
>> -		case XDP_PASS:
>> -			delta = orig_data - xdp.data;
>> -			len = xdp.data_end - xdp.data;
>> -			break;
>> -		default:
>> -			bpf_warn_invalid_xdp_action(act);
>> -			/* fall through */
>> -		case XDP_ABORTED:
>> -			trace_xdp_exception(tun->dev, xdp_prog, act);
>> -			/* fall through */
>> -		case XDP_DROP:
>> +		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
>> +		if (err)
>>   			goto err_xdp;
>> -		}
>> +		if (act != XDP_PASS)
>> +			goto out;
> likely?

It depends on the XDP program, so I tend not to use it.

>
>> +
>> +		pad = xdp.data - xdp.data_hard_start;
>> +		len = xdp.data_end - xdp.data;
>>   	}
>>   	rcu_read_unlock();
>>   	local_bh_enable();
>> @@ -1729,18 +1747,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   build:
>>   	skb = build_skb(buf, buflen);
>>   	if (!skb) {
>> +		put_page(alloc_frag->page);
>>   		skb = ERR_PTR(-ENOMEM);
>>   		goto out;
>>   	}
>>   
>> -	skb_reserve(skb, pad - delta);
>> +	skb_reserve(skb, pad);
>>   	skb_put(skb, len);
>>   
>>   	return skb;
>>   
>>   err_xdp:
>> -	alloc_frag->offset -= buflen;
>> -	put_page(alloc_frag->page);
>> +	this_cpu_inc(tun->pcpu_stats->rx_dropped);
>
> This fixes bug in previous patch which dropped it. OK :)

Yes, but let me move this to the buggy patch.

Thanks

>>   out:
>>   	rcu_read_unlock();
>>   	local_bh_enable();
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 07/11] tuntap: move XDP flushing out of tun_do_xdp()
From: Jason Wang @ 2018-09-07  3:31 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906132209-mutt-send-email-mst@kernel.org>



On 2018年09月07日 01:48, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:22PM +0800, Jason Wang wrote:
>> This will allow adding batch flushing on top.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/net/tun.c | 4 +++-
>>   1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 21b125020b3b..ff1cbf3ebd50 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1646,7 +1646,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
>>   	switch (act) {
>>   	case XDP_REDIRECT:
>>   		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
>> -		xdp_do_flush_map();
>>   		if (*err)
>>   			break;
>>   		goto out;
>> @@ -1735,6 +1734,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>>   		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
>>   		if (err)
>>   			goto err_xdp;
>> +
>> +		if (act == XDP_REDIRECT)
>> +			xdp_do_flush_map();
>>   		if (act != XDP_PASS)
>>   			goto out;
> At this point the switch statement which used to contain all XDP things
> seems to be gone completely. Just rewrite with a bunch of if statements
> and all xdp handling spread out to where it makes sense?

But tun_do_xdp() will be reused in the batching path.

Thanks

>
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 08/11] tun: switch to new type of msg_control
From: Jason Wang @ 2018-09-07  3:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906125051-mutt-send-email-mst@kernel.org>



On 2018年09月07日 00:54, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:23PM +0800, Jason Wang wrote:
>> This patch introduces to a new tun/tap specific msg_control:
>>
>> #define TUN_MSG_UBUF 1
>> #define TUN_MSG_PTR  2
>> struct tun_msg_ctl {
>>         int type;
>>         void *ptr;
>> };
>>
>> This allows us to pass different kinds of msg_control through
>> sendmsg(). The first supported type is ubuf (TUN_MSG_UBUF) which will
>> be used by the existed vhost_net zerocopy code. The second is XDP
>> buff, which allows vhost_net to pass XDP buff to TUN. This could be
>> used to implement accepting an array of XDP buffs from vhost_net in
>> the following patches.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> At this point, do we want to just add a new sock opt for tap's
> benefit? Seems cleaner than (ab)using sendmsg.

I think it won't be much difference, we still need a void pointer.

>> ---
>>   drivers/net/tap.c      | 18 ++++++++++++------
>>   drivers/net/tun.c      |  6 +++++-
>>   drivers/vhost/net.c    |  7 +++++--
>>   include/linux/if_tun.h |  7 +++++++
>>   4 files changed, 29 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>> index f0f7cd977667..7996ed7cbf18 100644
>> --- a/drivers/net/tap.c
>> +++ b/drivers/net/tap.c
>> @@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
>>   #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
>>   
>>   /* Get packet from user space buffer */
>> -static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
>> +static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
>>   			    struct iov_iter *from, int noblock)
>>   {
>>   	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
>> @@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
>>   	if (unlikely(len < ETH_HLEN))
>>   		goto err;
>>   
>> -	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
>> +	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
>>   		struct iov_iter i;
>>   
>>   		copylen = vnet_hdr.hdr_len ?
>> @@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
>>   	tap = rcu_dereference(q->tap);
>>   	/* copy skb_ubuf_info for callback when skb has no error */
>>   	if (zerocopy) {
>> -		skb_shinfo(skb)->destructor_arg = m->msg_control;
>> +		skb_shinfo(skb)->destructor_arg = msg_control;
>>   		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
>>   		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
>> -	} else if (m && m->msg_control) {
>> -		struct ubuf_info *uarg = m->msg_control;
>> +	} else if (msg_control) {
>> +		struct ubuf_info *uarg = msg_control;
>>   		uarg->callback(uarg, false);
>>   	}
>>   
>> @@ -1150,7 +1150,13 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m,
>>   		       size_t total_len)
>>   {
>>   	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
>> -	return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
>> +	struct tun_msg_ctl *ctl = m->msg_control;
>> +
>> +	if (ctl && ctl->type != TUN_MSG_UBUF)
>> +		return -EINVAL;
>> +
>> +	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
>> +			    m->msg_flags & MSG_DONTWAIT);
>>   }
>>   
>>   static int tap_recvmsg(struct socket *sock, struct msghdr *m,
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index ff1cbf3ebd50..c839a4bdcbd9 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -2429,11 +2429,15 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>>   	int ret;
>>   	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>>   	struct tun_struct *tun = tun_get(tfile);
>> +	struct tun_msg_ctl *ctl = m->msg_control;
>>   
>>   	if (!tun)
>>   		return -EBADFD;
>>   
>> -	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
>> +	if (ctl && ctl->type != TUN_MSG_UBUF)
>> +		return -EINVAL;
>> +
>> +	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
>>   			   m->msg_flags & MSG_DONTWAIT,
>>   			   m->msg_flags & MSG_MORE);
>>   	tun_put(tun);
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 4e656f89cb22..fb01ce6d981c 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -620,6 +620,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>>   		.msg_controllen = 0,
>>   		.msg_flags = MSG_DONTWAIT,
>>   	};
>> +	struct tun_msg_ctl ctl;
>>   	size_t len, total_len = 0;
>>   	int err;
>>   	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>> @@ -664,8 +665,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>>   			ubuf->ctx = nvq->ubufs;
>>   			ubuf->desc = nvq->upend_idx;
>>   			refcount_set(&ubuf->refcnt, 1);
>> -			msg.msg_control = ubuf;
>> -			msg.msg_controllen = sizeof(ubuf);
>> +			msg.msg_control = &ctl;
>> +			ctl.type = TUN_MSG_UBUF;
>> +			ctl.ptr = ubuf;
>> +			msg.msg_controllen = sizeof(ctl);
>>   			ubufs = nvq->ubufs;
>>   			atomic_inc(&ubufs->refcount);
>>   			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
>> diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
>> index 3d2996dc7d85..ba46dced1f38 100644
>> --- a/include/linux/if_tun.h
>> +++ b/include/linux/if_tun.h
>> @@ -19,6 +19,13 @@
>>   
>>   #define TUN_XDP_FLAG 0x1UL
>>   
>> +#define TUN_MSG_UBUF 1
>> +#define TUN_MSG_PTR  2
> Looks like TUN_MSG_PTR should be pushed out to a follow-up patch?

Ok.

>
>> +struct tun_msg_ctl {
>> +	int type;
>> +	void *ptr;
>> +};
>> +
> type actually includes a size. Why not two short fields then?

Yes, this sounds better.

Thanks

>
>>   #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
>>   struct socket *tun_get_socket(struct file *);
>>   struct ptr_ring *tun_get_tx_ring(struct file *file);
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 10/11] tap: accept an array of XDP buffs through sendmsg()
From: Jason Wang @ 2018-09-07  3:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906135450-mutt-send-email-mst@kernel.org>



On 2018年09月07日 02:00, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:25PM +0800, Jason Wang wrote:
>> This patch implement TUN_MSG_PTR msg_control type. This type allows
>> the caller to pass an array of XDP buffs to tuntap through ptr field
>> of the tun_msg_control. Tap will build skb through those XDP buffers.
>>
>> This will avoid lots of indirect calls thus improves the icache
>> utilization and allows to do XDP batched flushing when doing XDP
>> redirection.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/net/tap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 71 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>> index 7996ed7cbf18..50eb7bf22225 100644
>> --- a/drivers/net/tap.c
>> +++ b/drivers/net/tap.c
>> @@ -1146,14 +1146,83 @@ static const struct file_operations tap_fops = {
>>   #endif
>>   };
>>   
>> +static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
>> +{
>> +	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
>> +	int buflen = *(int *)xdp->data_hard_start;
>> +	int vnet_hdr_len = 0;
>> +	struct tap_dev *tap;
>> +	struct sk_buff *skb;
>> +	int err, depth;
>> +
>> +	if (q->flags & IFF_VNET_HDR)
>> +		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
>> +
>> +	skb = build_skb(xdp->data_hard_start, buflen);
>> +	if (!skb) {
>> +		err = -ENOMEM;
>> +		goto err;
>> +	}
> So fundamentally why is it called XDP?
> We just build and skb, don't we?

The reason is that the function accepts a pointer to XDP. And also the 
for the future development, I think the name is ok:

- we will probably do XDP offloading in this function.
- we may have a chance to call lower device's ndo_xdp_xmit() in the future.

Thanks

>
>> +
>> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
>> +	skb_put(skb, xdp->data_end - xdp->data);
>> +
>> +	skb_set_network_header(skb, ETH_HLEN);
>> +	skb_reset_mac_header(skb);
>> +	skb->protocol = eth_hdr(skb)->h_proto;
>> +
>> +	if (vnet_hdr_len) {
>> +		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
>> +		if (err)
>> +			goto err_kfree;
>> +	}
>> +
>> +	skb_probe_transport_header(skb, ETH_HLEN);
>> +
>> +	/* Move network header to the right position for VLAN tagged packets */
>> +	if ((skb->protocol == htons(ETH_P_8021Q) ||
>> +	     skb->protocol == htons(ETH_P_8021AD)) &&
>> +	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
>> +		skb_set_network_header(skb, depth);
>> +
>> +	rcu_read_lock();
>> +	tap = rcu_dereference(q->tap);
>> +	if (tap) {
>> +		skb->dev = tap->dev;
>> +		dev_queue_xmit(skb);
>> +	} else {
>> +		kfree_skb(skb);
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	return 0;
>> +
>> +err_kfree:
>> +	kfree_skb(skb);
>> +err:
>> +	rcu_read_lock();
>> +		tap = rcu_dereference(q->tap);
>> +	if (tap && tap->count_tx_dropped)
>> +		tap->count_tx_dropped(tap);
>> +	rcu_read_unlock();
>> +	return err;
>> +}
>> +
>>   static int tap_sendmsg(struct socket *sock, struct msghdr *m,
>>   		       size_t total_len)
>>   {
>>   	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
>>   	struct tun_msg_ctl *ctl = m->msg_control;
>> +	struct xdp_buff *xdp;
>> +	int i;
>>   
>> -	if (ctl && ctl->type != TUN_MSG_UBUF)
>> -		return -EINVAL;
>> +	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
>> +		for (i = 0; i < ctl->type >> 16; i++) {
>> +			xdp = &((struct xdp_buff *)ctl->ptr)[i];
>> +			tap_get_user_xdp(q, xdp);
>> +		}
>> +		return 0;
>> +	}
>>   
>>   	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
>>   			    m->msg_flags & MSG_DONTWAIT);
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 09/11] tuntap: accept an array of XDP buffs through sendmsg()
From: Jason Wang @ 2018-09-07  7:33 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906134834-mutt-send-email-mst@kernel.org>



On 2018年09月07日 01:51, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:24PM +0800, Jason Wang wrote:
>> This patch implement TUN_MSG_PTR msg_control type. This type allows
>> the caller to pass an array of XDP buffs to tuntap through ptr field
>> of the tun_msg_control. If an XDP program is attached, tuntap can run
>> XDP program directly. If not, tuntap will build skb and do a fast
>> receiving since part of the work has been done by vhost_net.
>>
>> This will avoid lots of indirect calls thus improves the icache
>> utilization and allows to do XDP batched flushing when doing XDP
>> redirection.
>>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
> Is most of the benefit in batched flushing or skipping
> indirect calls? Because if it's flushing we can gain
> most of it easily by adding an analog of xmit_more.
>

Should be both. XDP_DROP doesn't flush but it gives obvious improvement.

Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH -next] drm/virtio: Remove set but not used variable 'bo'
From: Gerd Hoffmann @ 2018-09-07  7:34 UTC (permalink / raw)
  To: YueHaibing
  Cc: David Airlie, kernel-janitors, linux-kernel, dri-devel,
	virtualization
In-Reply-To: <1536285837-150460-1-git-send-email-yuehaibing@huawei.com>

On Fri, Sep 07, 2018 at 02:03:57AM +0000, YueHaibing wrote:
> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> drivers/gpu/drm/virtio/virtgpu_display.c: In function 'virtio_gpu_framebuffer_init':
> drivers/gpu/drm/virtio/virtgpu_display.c:78:28: warning:
>  variable 'bo' set but not used [-Wunused-but-set-variable]
>   struct virtio_gpu_object *bo;
>                             ^

Patch queued.

thanks,
  Gerd

^ permalink raw reply

* Re: [PATCH net-next 11/11] vhost_net: batch submitting XDP buffers to underlayer sockets
From: Jason Wang @ 2018-09-07  7:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906122857-mutt-send-email-mst@kernel.org>



On 2018年09月07日 00:46, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:26PM +0800, Jason Wang wrote:
>> This patch implements XDP batching for vhost_net. The idea is first to
>> try to do userspace copy and build XDP buff directly in vhost. Instead
>> of submitting the packet immediately, vhost_net will batch them in an
>> array and submit every 64 (VHOST_NET_BATCH) packets to the under layer
>> sockets through msg_control of sendmsg().
>>
>> When XDP is enabled on the TUN/TAP, TUN/TAP can process XDP inside a
>> loop without caring GUP thus it can do batch map flushing. When XDP is
>> not enabled or not supported, the underlayer socket need to build skb
>> and pass it to network core. The batched packet submission allows us
>> to do batching like netif_receive_skb_list() in the future.
>>
>> This saves lots of indirect calls for better cache utilization. For
>> the case that we can't so batching e.g when sndbuf is limited or
>> packet size is too large, we will go for usual one packet per
>> sendmsg() way.
>>
>> Doing testpmd on various setups gives us:
>>
>> Test                /+pps%
>> XDP_DROP on TAP     /+44.8%
>> XDP_REDIRECT on TAP /+29%
>> macvtap (skb)       /+26%
>>
>> Netperf tests shows obvious improvements for small packet transmission:
>>
>> size/session/+thu%/+normalize%
>>     64/     1/   +2%/    0%
>>     64/     2/   +3%/   +1%
>>     64/     4/   +7%/   +5%
>>     64/     8/   +8%/   +6%
>>    256/     1/   +3%/    0%
>>    256/     2/  +10%/   +7%
>>    256/     4/  +26%/  +22%
>>    256/     8/  +27%/  +23%
>>    512/     1/   +3%/   +2%
>>    512/     2/  +19%/  +14%
>>    512/     4/  +43%/  +40%
>>    512/     8/  +45%/  +41%
>>   1024/     1/   +4%/    0%
>>   1024/     2/  +27%/  +21%
>>   1024/     4/  +38%/  +73%
>>   1024/     8/  +15%/  +24%
>>   2048/     1/  +10%/   +7%
>>   2048/     2/  +16%/  +12%
>>   2048/     4/    0%/   +2%
>>   2048/     8/    0%/   +2%
>>   4096/     1/  +36%/  +60%
>>   4096/     2/  -11%/  -26%
>>   4096/     4/    0%/  +14%
>>   4096/     8/    0%/   +4%
>> 16384/     1/   -1%/   +5%
>> 16384/     2/    0%/   +2%
>> 16384/     4/    0%/   -3%
>> 16384/     8/    0%/   +4%
>> 65535/     1/    0%/  +10%
>> 65535/     2/    0%/   +8%
>> 65535/     4/    0%/   +1%
>> 65535/     8/    0%/   +3%
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/net.c | 164 ++++++++++++++++++++++++++++++++++++++++----
>>   1 file changed, 151 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index fb01ce6d981c..1dd4239cbff8 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -116,6 +116,7 @@ struct vhost_net_virtqueue {
>>   	 * For RX, number of batched heads
>>   	 */
>>   	int done_idx;
>> +	int batched_xdp;
> Pls add a comment documenting what does this new field do.

Ok.

>
>>   	/* an array of userspace buffers info */
>>   	struct ubuf_info *ubuf_info;
>>   	/* Reference counting for outstanding ubufs.
>> @@ -123,6 +124,7 @@ struct vhost_net_virtqueue {
>>   	struct vhost_net_ubuf_ref *ubufs;
>>   	struct ptr_ring *rx_ring;
>>   	struct vhost_net_buf rxq;
>> +	struct xdp_buff xdp[VHOST_NET_BATCH];
> This is a bit too much to have inline. 64 entries 48 bytes each, and we
> have 2 of these structures, that's already >4K.

Let me allocate it elsewhere.

>
>>   };
>>   
>>   struct vhost_net {
>> @@ -338,6 +340,11 @@ static bool vhost_sock_zcopy(struct socket *sock)
>>   		sock_flag(sock->sk, SOCK_ZEROCOPY);
>>   }
>>   
>> +static bool vhost_sock_xdp(struct socket *sock)
>> +{
>> +	return sock_flag(sock->sk, SOCK_XDP);
> what if an xdp program is attached while this processing
> is going on? Flag value will be wrong - is this safe
> and why? Pls add a comment.

Ok, let me add comment to explain. It's safe and may just lead few 
packets to be dropped if XDP program want to extend packet's header.

>
>> +}
>> +
>>   /* In case of DMA done not in order in lower device driver for some reason.
>>    * upend_idx is used to track end of used idx, done_idx is used to track head
>>    * of used idx. Once lower device DMA done contiguously, we will signal KVM
>> @@ -444,10 +451,36 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
>>   	nvq->done_idx = 0;
>>   }
>>   
>> +static void vhost_tx_batch(struct vhost_net *net,
>> +			   struct vhost_net_virtqueue *nvq,
>> +			   struct socket *sock,
>> +			   struct msghdr *msghdr)
>> +{
>> +	struct tun_msg_ctl ctl = {
>> +		.type = nvq->batched_xdp << 16 | TUN_MSG_PTR,
>> +		.ptr = nvq->xdp,
>> +	};
>> +	int err;
>> +
>> +	if (nvq->batched_xdp == 0)
>> +		goto signal_used;
>> +
>> +	msghdr->msg_control = &ctl;
>> +	err = sock->ops->sendmsg(sock, msghdr, 0);
>> +	if (unlikely(err < 0)) {
>> +		vq_err(&nvq->vq, "Fail to batch sending packets\n");
>> +		return;
>> +	}
>> +
>> +signal_used:
>> +	vhost_net_signal_used(nvq);
>> +	nvq->batched_xdp = 0;
>> +}
>> +
>>   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>>   				    struct vhost_net_virtqueue *nvq,
>>   				    unsigned int *out_num, unsigned int *in_num,
>> -				    bool *busyloop_intr)
>> +				    struct msghdr *msghdr, bool *busyloop_intr)
>>   {
>>   	struct vhost_virtqueue *vq = &nvq->vq;
>>   	unsigned long uninitialized_var(endtime);
>> @@ -455,8 +488,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>>   				  out_num, in_num, NULL, NULL);
>>   
>>   	if (r == vq->num && vq->busyloop_timeout) {
>> +		/* Flush batched packets first */
>>   		if (!vhost_sock_zcopy(vq->private_data))
>> -			vhost_net_signal_used(nvq);
>> +			vhost_tx_batch(net, nvq, vq->private_data, msghdr);
>>   		preempt_disable();
>>   		endtime = busy_clock() + vq->busyloop_timeout;
>>   		while (vhost_can_busy_poll(endtime)) {
>> @@ -512,7 +546,7 @@ static int get_tx_bufs(struct vhost_net *net,
>>   	struct vhost_virtqueue *vq = &nvq->vq;
>>   	int ret;
>>   
>> -	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
>> +	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
>>   
>>   	if (ret < 0 || ret == vq->num)
>>   		return ret;
>> @@ -540,6 +574,83 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
>>   	       !vhost_vq_avail_empty(vq->dev, vq);
>>   }
>>   
>> +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> I wonder whether NET_IP_ALIGN make sense for XDP.

XDP is not the only consumer, socket may build skb based on this.

>
>> +
>> +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
>> +			       struct iov_iter *from)
>> +{
>> +	struct vhost_virtqueue *vq = &nvq->vq;
>> +	struct socket *sock = vq->private_data;
>> +	struct page_frag *alloc_frag = &current->task_frag;
>> +	struct virtio_net_hdr *gso;
>> +	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
>> +	size_t len = iov_iter_count(from);
>> +	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
>> +	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> +	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
>> +	int sock_hlen = nvq->sock_hlen;
>> +	void *buf;
>> +	int copied;
>> +
>> +	if (unlikely(len < nvq->sock_hlen))
>> +		return -EFAULT;
>> +
>> +	if (SKB_DATA_ALIGN(len + pad) +
>> +	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
>> +		return -ENOSPC;
>> +
>> +	buflen += SKB_DATA_ALIGN(len + pad);
>> +	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
>> +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
>> +		return -ENOMEM;
>> +
>> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>> +
>> +	/* We store two kinds of metadata in the header which will be
>> +	 * used for XDP_PASS to do build_skb():
>> +	 * offset 0: buflen
>> +	 * offset sizeof(int): vnet header
> Define a struct for the metadata then?

Ok.

>
>
>> +	 */
>> +	copied = copy_page_from_iter(alloc_frag->page,
>> +				     alloc_frag->offset + sizeof(int),
>> +				     sock_hlen, from);
>> +	if (copied != sock_hlen)
>> +		return -EFAULT;
>> +
>> +	gso = (struct virtio_net_hdr *)(buf + sizeof(int));
>> +
>> +	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
>> +	    vhost16_to_cpu(vq, gso->csum_start) +
>> +	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
>> +	    vhost16_to_cpu(vq, gso->hdr_len)) {
>> +		gso->hdr_len = cpu_to_vhost16(vq,
>> +			       vhost16_to_cpu(vq, gso->csum_start) +
>> +			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
>> +
>> +		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
>> +			return -EINVAL;
>> +	}
>> +
>> +	len -= sock_hlen;
>> +	copied = copy_page_from_iter(alloc_frag->page,
>> +				     alloc_frag->offset + pad,
>> +				     len, from);
>> +	if (copied != len)
>> +		return -EFAULT;
>> +
>> +	xdp->data_hard_start = buf;
>> +	xdp->data = buf + pad;
>> +	xdp->data_end = xdp->data + len;
>> +	*(int *)(xdp->data_hard_start) = buflen;
>> +
>> +	get_page(alloc_frag->page);
>> +	alloc_frag->offset += buflen;
>> +
>> +	++nvq->batched_xdp;
>> +
>> +	return 0;
>> +}
>> +
>>   static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>>   {
>>   	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>> @@ -556,10 +667,14 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>>   	size_t len, total_len = 0;
>>   	int err;
>>   	int sent_pkts = 0;
>> +	bool bulking = (sock->sk->sk_sndbuf == INT_MAX);
> What does bulking mean?

The name is misleading, it means whether we can do batching. For 
simplicity, I disable batching is sndbuf is not INT_MAX.

>>   
>>   	for (;;) {
>>   		bool busyloop_intr = false;
>>   
>> +		if (nvq->done_idx == VHOST_NET_BATCH)
>> +			vhost_tx_batch(net, nvq, sock, &msg);
>> +
>>   		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
>>   				   &busyloop_intr);
>>   		/* On error, stop handling until the next kick. */
>> @@ -577,14 +692,34 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>>   			break;
>>   		}
>>   
>> -		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
>> -		vq->heads[nvq->done_idx].len = 0;
>> -
>>   		total_len += len;
>> -		if (tx_can_batch(vq, total_len))
>> -			msg.msg_flags |= MSG_MORE;
>> -		else
>> -			msg.msg_flags &= ~MSG_MORE;
>> +
>> +		/* For simplicity, TX batching is only enabled if
>> +		 * sndbuf is unlimited.
> What if sndbuf changes while this processing is going on?

We will get the correct sndbuf in the next run of handle_tx(). I think 
this is safe.

Thanks

>
>> +		 */
>> +		if (bulking) {
>> +			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
>> +			if (!err) {
>> +				goto done;
>> +			} else if (unlikely(err != -ENOSPC)) {
>> +				vhost_tx_batch(net, nvq, sock, &msg);
>> +				vhost_discard_vq_desc(vq, 1);
>> +				vhost_net_enable_vq(net, vq);
>> +				break;
>> +			}
>> +
>> +			/* We can't build XDP buff, go for single
>> +			 * packet path but let's flush batched
>> +			 * packets.
>> +			 */
>> +			vhost_tx_batch(net, nvq, sock, &msg);
>> +			msg.msg_control = NULL;
>> +		} else {
>> +			if (tx_can_batch(vq, total_len))
>> +				msg.msg_flags |= MSG_MORE;
>> +			else
>> +				msg.msg_flags &= ~MSG_MORE;
>> +		}
>>   
>>   		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>>   		err = sock->ops->sendmsg(sock, &msg, len);
>> @@ -596,15 +731,17 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
>>   		if (err != len)
>>   			pr_debug("Truncated TX packet: len %d != %zd\n",
>>   				 err, len);
>> -		if (++nvq->done_idx >= VHOST_NET_BATCH)
>> -			vhost_net_signal_used(nvq);
>> +done:
>> +		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
>> +		vq->heads[nvq->done_idx].len = 0;
>> +		++nvq->done_idx;
>>   		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
>>   			vhost_poll_queue(&vq->poll);
>>   			break;
>>   		}
>>   	}
>>   
>> -	vhost_net_signal_used(nvq);
>> +	vhost_tx_batch(net, nvq, sock, &msg);
>>   }
>>   
>>   static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>> @@ -1111,6 +1248,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>>   		n->vqs[i].ubuf_info = NULL;
>>   		n->vqs[i].upend_idx = 0;
>>   		n->vqs[i].done_idx = 0;
>> +		n->vqs[i].batched_xdp = 0;
>>   		n->vqs[i].vhost_hlen = 0;
>>   		n->vqs[i].sock_hlen = 0;
>>   		n->vqs[i].rx_ring = NULL;
>> -- 
>> 2.17.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH] bochs: convert to drm_fb_helper_fbdev_setup/teardown
From: kbuild test robot @ 2018-09-07  8:10 UTC (permalink / raw)
  To: Peter Wu
  Cc: David Airlie, linux-kernel, dri-devel, virtualization, kbuild-all
In-Reply-To: <20180905144127.5690-1-peter@lekensteyn.nl>

[-- Attachment #1: Type: text/plain, Size: 12327 bytes --]

Hi Peter,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v4.19-rc2 next-20180906]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Peter-Wu/bochs-convert-to-drm_fb_helper_fbdev_setup-teardown/20180907-154819
config: i386-randconfig-x006-201835 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from include/asm-generic/bug.h:5:0,
                    from arch/x86/include/asm/bug.h:83,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:9,
                    from drivers/gpu/drm/bochs/bochs_drv.c:8:
   drivers/gpu/drm/bochs/bochs_drv.c: In function 'bochs_pm_suspend':
   drivers/gpu/drm/bochs/bochs_drv.c:110:15: error: 'struct <anonymous>' has no member named 'initialized'
     if (bochs->fb.initialized) {
                  ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
>> drivers/gpu/drm/bochs/bochs_drv.c:110:2: note: in expansion of macro 'if'
     if (bochs->fb.initialized) {
     ^~
   drivers/gpu/drm/bochs/bochs_drv.c:110:15: error: 'struct <anonymous>' has no member named 'initialized'
     if (bochs->fb.initialized) {
                  ^
   include/linux/compiler.h:58:42: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                             ^~~~
>> drivers/gpu/drm/bochs/bochs_drv.c:110:2: note: in expansion of macro 'if'
     if (bochs->fb.initialized) {
     ^~
   drivers/gpu/drm/bochs/bochs_drv.c:110:15: error: 'struct <anonymous>' has no member named 'initialized'
     if (bochs->fb.initialized) {
                  ^
   include/linux/compiler.h:69:16: note: in definition of macro '__trace_if'
      ______r = !!(cond);     \
                   ^~~~
>> drivers/gpu/drm/bochs/bochs_drv.c:110:2: note: in expansion of macro 'if'
     if (bochs->fb.initialized) {
     ^~
   drivers/gpu/drm/bochs/bochs_drv.c: In function 'bochs_pm_resume':
   drivers/gpu/drm/bochs/bochs_drv.c:127:15: error: 'struct <anonymous>' has no member named 'initialized'
     if (bochs->fb.initialized) {
                  ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
   drivers/gpu/drm/bochs/bochs_drv.c:127:2: note: in expansion of macro 'if'
     if (bochs->fb.initialized) {
     ^~
   drivers/gpu/drm/bochs/bochs_drv.c:127:15: error: 'struct <anonymous>' has no member named 'initialized'
     if (bochs->fb.initialized) {
                  ^
   include/linux/compiler.h:58:42: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                             ^~~~
   drivers/gpu/drm/bochs/bochs_drv.c:127:2: note: in expansion of macro 'if'
     if (bochs->fb.initialized) {
     ^~
   drivers/gpu/drm/bochs/bochs_drv.c:127:15: error: 'struct <anonymous>' has no member named 'initialized'
     if (bochs->fb.initialized) {
                  ^
   include/linux/compiler.h:69:16: note: in definition of macro '__trace_if'
      ______r = !!(cond);     \
                   ^~~~
   drivers/gpu/drm/bochs/bochs_drv.c:127:2: note: in expansion of macro 'if'
     if (bochs->fb.initialized) {
     ^~

vim +/if +110 drivers/gpu/drm/bochs/bochs_drv.c

0a6659bd Gerd Hoffmann           2013-12-17   @8  #include <linux/mm.h>
0a6659bd Gerd Hoffmann           2013-12-17    9  #include <linux/module.h>
0a6659bd Gerd Hoffmann           2013-12-17   10  #include <linux/slab.h>
44adece5 Daniel Vetter           2016-08-10   11  #include <drm/drm_fb_helper.h>
0a6659bd Gerd Hoffmann           2013-12-17   12  
0a6659bd Gerd Hoffmann           2013-12-17   13  #include "bochs.h"
0a6659bd Gerd Hoffmann           2013-12-17   14  
1acf5661 Max Staudt              2017-01-18   15  static int bochs_modeset = -1;
1acf5661 Max Staudt              2017-01-18   16  module_param_named(modeset, bochs_modeset, int, 0444);
1acf5661 Max Staudt              2017-01-18   17  MODULE_PARM_DESC(modeset, "enable/disable kernel modesetting");
1acf5661 Max Staudt              2017-01-18   18  
0a6659bd Gerd Hoffmann           2013-12-17   19  static bool enable_fbdev = true;
0a6659bd Gerd Hoffmann           2013-12-17   20  module_param_named(fbdev, enable_fbdev, bool, 0444);
0a6659bd Gerd Hoffmann           2013-12-17   21  MODULE_PARM_DESC(fbdev, "register fbdev device");
0a6659bd Gerd Hoffmann           2013-12-17   22  
0a6659bd Gerd Hoffmann           2013-12-17   23  /* ---------------------------------------------------------------------- */
0a6659bd Gerd Hoffmann           2013-12-17   24  /* drm interface                                                          */
0a6659bd Gerd Hoffmann           2013-12-17   25  
11b3c20b Gabriel Krisman Bertazi 2017-01-06   26  static void bochs_unload(struct drm_device *dev)
0a6659bd Gerd Hoffmann           2013-12-17   27  {
0a6659bd Gerd Hoffmann           2013-12-17   28  	struct bochs_device *bochs = dev->dev_private;
0a6659bd Gerd Hoffmann           2013-12-17   29  
0a6659bd Gerd Hoffmann           2013-12-17   30  	bochs_fbdev_fini(bochs);
0a6659bd Gerd Hoffmann           2013-12-17   31  	bochs_kms_fini(bochs);
0a6659bd Gerd Hoffmann           2013-12-17   32  	bochs_mm_fini(bochs);
0a6659bd Gerd Hoffmann           2013-12-17   33  	bochs_hw_fini(dev);
0a6659bd Gerd Hoffmann           2013-12-17   34  	kfree(bochs);
0a6659bd Gerd Hoffmann           2013-12-17   35  	dev->dev_private = NULL;
0a6659bd Gerd Hoffmann           2013-12-17   36  }
0a6659bd Gerd Hoffmann           2013-12-17   37  
0a6659bd Gerd Hoffmann           2013-12-17   38  static int bochs_load(struct drm_device *dev, unsigned long flags)
0a6659bd Gerd Hoffmann           2013-12-17   39  {
0a6659bd Gerd Hoffmann           2013-12-17   40  	struct bochs_device *bochs;
0a6659bd Gerd Hoffmann           2013-12-17   41  	int ret;
0a6659bd Gerd Hoffmann           2013-12-17   42  
0a6659bd Gerd Hoffmann           2013-12-17   43  	bochs = kzalloc(sizeof(*bochs), GFP_KERNEL);
0a6659bd Gerd Hoffmann           2013-12-17   44  	if (bochs == NULL)
0a6659bd Gerd Hoffmann           2013-12-17   45  		return -ENOMEM;
0a6659bd Gerd Hoffmann           2013-12-17   46  	dev->dev_private = bochs;
0a6659bd Gerd Hoffmann           2013-12-17   47  	bochs->dev = dev;
0a6659bd Gerd Hoffmann           2013-12-17   48  
0a6659bd Gerd Hoffmann           2013-12-17   49  	ret = bochs_hw_init(dev, flags);
0a6659bd Gerd Hoffmann           2013-12-17   50  	if (ret)
0a6659bd Gerd Hoffmann           2013-12-17   51  		goto err;
0a6659bd Gerd Hoffmann           2013-12-17   52  
0a6659bd Gerd Hoffmann           2013-12-17   53  	ret = bochs_mm_init(bochs);
0a6659bd Gerd Hoffmann           2013-12-17   54  	if (ret)
0a6659bd Gerd Hoffmann           2013-12-17   55  		goto err;
0a6659bd Gerd Hoffmann           2013-12-17   56  
0a6659bd Gerd Hoffmann           2013-12-17   57  	ret = bochs_kms_init(bochs);
0a6659bd Gerd Hoffmann           2013-12-17   58  	if (ret)
0a6659bd Gerd Hoffmann           2013-12-17   59  		goto err;
0a6659bd Gerd Hoffmann           2013-12-17   60  
0a6659bd Gerd Hoffmann           2013-12-17   61  	if (enable_fbdev)
0a6659bd Gerd Hoffmann           2013-12-17   62  		bochs_fbdev_init(bochs);
0a6659bd Gerd Hoffmann           2013-12-17   63  
0a6659bd Gerd Hoffmann           2013-12-17   64  	return 0;
0a6659bd Gerd Hoffmann           2013-12-17   65  
0a6659bd Gerd Hoffmann           2013-12-17   66  err:
0a6659bd Gerd Hoffmann           2013-12-17   67  	bochs_unload(dev);
0a6659bd Gerd Hoffmann           2013-12-17   68  	return ret;
0a6659bd Gerd Hoffmann           2013-12-17   69  }
0a6659bd Gerd Hoffmann           2013-12-17   70  
0a6659bd Gerd Hoffmann           2013-12-17   71  static const struct file_operations bochs_fops = {
0a6659bd Gerd Hoffmann           2013-12-17   72  	.owner		= THIS_MODULE,
0a6659bd Gerd Hoffmann           2013-12-17   73  	.open		= drm_open,
0a6659bd Gerd Hoffmann           2013-12-17   74  	.release	= drm_release,
0a6659bd Gerd Hoffmann           2013-12-17   75  	.unlocked_ioctl	= drm_ioctl,
0a6659bd Gerd Hoffmann           2013-12-17   76  	.compat_ioctl	= drm_compat_ioctl,
0a6659bd Gerd Hoffmann           2013-12-17   77  	.poll		= drm_poll,
0a6659bd Gerd Hoffmann           2013-12-17   78  	.read		= drm_read,
0a6659bd Gerd Hoffmann           2013-12-17   79  	.llseek		= no_llseek,
0a6659bd Gerd Hoffmann           2013-12-17   80  	.mmap           = bochs_mmap,
0a6659bd Gerd Hoffmann           2013-12-17   81  };
0a6659bd Gerd Hoffmann           2013-12-17   82  
0a6659bd Gerd Hoffmann           2013-12-17   83  static struct drm_driver bochs_driver = {
0a6659bd Gerd Hoffmann           2013-12-17   84  	.driver_features	= DRIVER_GEM | DRIVER_MODESET,
0a6659bd Gerd Hoffmann           2013-12-17   85  	.load			= bochs_load,
0a6659bd Gerd Hoffmann           2013-12-17   86  	.unload			= bochs_unload,
0a6659bd Gerd Hoffmann           2013-12-17   87  	.fops			= &bochs_fops,
0a6659bd Gerd Hoffmann           2013-12-17   88  	.name			= "bochs-drm",
0a6659bd Gerd Hoffmann           2013-12-17   89  	.desc			= "bochs dispi vga interface (qemu stdvga)",
0a6659bd Gerd Hoffmann           2013-12-17   90  	.date			= "20130925",
0a6659bd Gerd Hoffmann           2013-12-17   91  	.major			= 1,
0a6659bd Gerd Hoffmann           2013-12-17   92  	.minor			= 0,
6f2c1c15 Daniel Vetter           2016-05-30   93  	.gem_free_object_unlocked = bochs_gem_free_object,
0a6659bd Gerd Hoffmann           2013-12-17   94  	.dumb_create            = bochs_dumb_create,
0a6659bd Gerd Hoffmann           2013-12-17   95  	.dumb_map_offset        = bochs_dumb_mmap_offset,
0a6659bd Gerd Hoffmann           2013-12-17   96  };
0a6659bd Gerd Hoffmann           2013-12-17   97  
0a6659bd Gerd Hoffmann           2013-12-17   98  /* ---------------------------------------------------------------------- */
b8ccd70f Gerd Hoffmann           2014-04-14   99  /* pm interface                                                           */
b8ccd70f Gerd Hoffmann           2014-04-14  100  
150cee9c Russell King            2014-07-12  101  #ifdef CONFIG_PM_SLEEP
b8ccd70f Gerd Hoffmann           2014-04-14  102  static int bochs_pm_suspend(struct device *dev)
b8ccd70f Gerd Hoffmann           2014-04-14  103  {
b8ccd70f Gerd Hoffmann           2014-04-14  104  	struct pci_dev *pdev = to_pci_dev(dev);
b8ccd70f Gerd Hoffmann           2014-04-14  105  	struct drm_device *drm_dev = pci_get_drvdata(pdev);
b8ccd70f Gerd Hoffmann           2014-04-14  106  	struct bochs_device *bochs = drm_dev->dev_private;
b8ccd70f Gerd Hoffmann           2014-04-14  107  
b8ccd70f Gerd Hoffmann           2014-04-14  108  	drm_kms_helper_poll_disable(drm_dev);
b8ccd70f Gerd Hoffmann           2014-04-14  109  
b8ccd70f Gerd Hoffmann           2014-04-14 @110  	if (bochs->fb.initialized) {
b8ccd70f Gerd Hoffmann           2014-04-14  111  		console_lock();
6a752972 Archit Taneja           2015-07-31  112  		drm_fb_helper_set_suspend(&bochs->fb.helper, 1);
b8ccd70f Gerd Hoffmann           2014-04-14  113  		console_unlock();
b8ccd70f Gerd Hoffmann           2014-04-14  114  	}
b8ccd70f Gerd Hoffmann           2014-04-14  115  
b8ccd70f Gerd Hoffmann           2014-04-14  116  	return 0;
b8ccd70f Gerd Hoffmann           2014-04-14  117  }
b8ccd70f Gerd Hoffmann           2014-04-14  118  

:::::: The code at line 110 was first introduced by commit
:::::: b8ccd70f1363f7d4e49219dbc46ec973a14f49cd drm: bochs: add power management support

:::::: TO: Gerd Hoffmann <kraxel@redhat.com>
:::::: CC: Dave Airlie <airlied@redhat.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30665 bytes --]

[-- Attachment #3: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH] x86/paravirt: Get rid of patch_site label
From: Borislav Petkov @ 2018-09-07 10:49 UTC (permalink / raw)
  To: LKML; +Cc: Juergen Gross, x86, virtualization

From: Borislav Petkov <bp@suse.de>

When CONFIG_PARAVIRT_SPINLOCKS=n, it fires

  arch/x86/kernel/paravirt_patch_64.c: In function ‘native_patch’:
  arch/x86/kernel/paravirt_patch_64.c:89:1: warning: label ‘patch_site’ defined but not used [-Wunused-label]
   patch_site:

but that label can simply be removed by directly calling
paravirt_patch_insns() there.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Juergen Gross <jgross@suse.com>
Cc: x86@kernel.org
Cc: virtualization@lists.linux-foundation.org
---
 arch/x86/kernel/paravirt_patch_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 5ad5bcda9dc6..f09d264cbd2d 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -76,7 +76,8 @@ unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len)
 			if (pv_is_native_vcpu_is_preempted()) {
 				start = start_lock_vcpu_is_preempted;
 				end   = end_lock_vcpu_is_preempted;
-				goto patch_site;
+
+				return paravirt_patch_insns(ibuf, len, start, end);
 			}
 			goto patch_default;
 #endif
@@ -86,7 +87,6 @@ patch_default: __maybe_unused
 		ret = paravirt_patch_default(type, ibuf, addr, len);
 		break;
 
-patch_site:
 		ret = paravirt_patch_insns(ibuf, len, start, end);
 		break;
 	}
-- 
2.17.0.582.gccdcbd54c

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related

* Re: [PATCH] x86/paravirt: Get rid of patch_site label
From: Borislav Petkov @ 2018-09-07 10:51 UTC (permalink / raw)
  To: LKML; +Cc: Juergen Gross, x86, virtualization
In-Reply-To: <20180907104917.12502-1-bp@alien8.de>

On Fri, Sep 07, 2018 at 12:49:17PM +0200, Borislav Petkov wrote:
> From: Borislav Petkov <bp@suse.de>
> 
> When CONFIG_PARAVIRT_SPINLOCKS=n, it fires
> 
>   arch/x86/kernel/paravirt_patch_64.c: In function ‘native_patch’:
>   arch/x86/kernel/paravirt_patch_64.c:89:1: warning: label ‘patch_site’ defined but not used [-Wunused-label]
>    patch_site:
> 
> but that label can simply be removed by directly calling
> paravirt_patch_insns() there.

Whoops, no, it is still being used. Lemme see if I can remove all the
labels in that function.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v36 0/5] Virtio-balloon: support free page reporting
From: Dr. David Alan Gilbert @ 2018-09-07 12:29 UTC (permalink / raw)
  To: Wei Wang
  Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm,
	Michael S. Tsirkin, nilal, liliang.opensource, linux-kernel,
	virtualization, linux-mm, pbonzini, akpm, mhocko, torvalds
In-Reply-To: <5B911B03.2060602@intel.com>

* Wei Wang (wei.w.wang@intel.com) wrote:
> On 07/23/2018 10:36 PM, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (mst@redhat.com) wrote:
> > > On Fri, Jul 20, 2018 at 04:33:00PM +0800, Wei Wang wrote:
> > > > This patch series is separated from the previous "Virtio-balloon
> > > > Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> > > > implemented by this series enables the virtio-balloon driver to report
> > > > hints of guest free pages to the host. It can be used to accelerate live
> > > > migration of VMs. Here is an introduction of this usage:
> > > > 
> > > > Live migration needs to transfer the VM's memory from the source machine
> > > > to the destination round by round. For the 1st round, all the VM's memory
> > > > is transferred. From the 2nd round, only the pieces of memory that were
> > > > written by the guest (after the 1st round) are transferred. One method
> > > > that is popularly used by the hypervisor to track which part of memory is
> > > > written is to write-protect all the guest memory.
> > > > 
> > > > This feature enables the optimization by skipping the transfer of guest
> > > > free pages during VM live migration. It is not concerned that the memory
> > > > pages are used after they are given to the hypervisor as a hint of the
> > > > free pages, because they will be tracked by the hypervisor and transferred
> > > > in the subsequent round if they are used and written.
> > > > 
> > > > * Tests
> > > > - Test Environment
> > > >      Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
> > > >      Guest: 8G RAM, 4 vCPU
> > > >      Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
> > > > 
> > > > - Test Results
> > > >      - Idle Guest Live Migration Time (results are averaged over 10 runs):
> > > >          - Optimization v.s. Legacy = 409ms vs 1757ms --> ~77% reduction
> > > > 	(setting page poisoning zero and enabling ksm don't affect the
> > > >           comparison result)
> > > >      - Guest with Linux Compilation Workload (make bzImage -j4):
> > > >          - Live Migration Time (average)
> > > >            Optimization v.s. Legacy = 1407ms v.s. 2528ms --> ~44% reduction
> > > >          - Linux Compilation Time
> > > >            Optimization v.s. Legacy = 5min4s v.s. 5min12s
> > > >            --> no obvious difference
> > > I'd like to see dgilbert's take on whether this kind of gain
> > > justifies adding a PV interfaces, and what kind of guest workload
> > > is appropriate.
> > > 
> > > Cc'd.
> > Well, 44% is great ... although the measurement is a bit weird.
> > 
> > a) A 2 second downtime is very large; 300-500ms is more normal
> > b) I'm not sure what the 'average' is  - is that just between a bunch of
> > repeated migrations?
> > c) What load was running in the guest during the live migration?
> > 
> > An interesting measurement to add would be to do the same test but
> > with a VM with a lot more RAM but the same load;  you'd hope the gain
> > would be even better.
> > It would be interesting, especially because the users who are interested
> > are people creating VMs allocated with lots of extra memory (for the
> > worst case) but most of the time migrating when it's fairly idle.
> > 
> > Dave
> > 
> 
> Hi Dave,
> 
> The results of the added experiments have been shown in the v37 cover
> letter.
> Could you have a look at https://lkml.org/lkml/2018/8/27/29 . Thanks.

OK, that's much better.
The ~50% reducton with a 8G VM and a real workload is great,
and it does what you expect when you put a lot more RAM in and see the
84% reduction on a guest with 128G RAM - 54s vs ~9s is a big win!

(The migrate_set_speed is a bit high, since that's in bytes/s - but it's
not important).

That looks good,

Thanks!

Dave

> Best,
> Wei
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply

* Re: [virtio-dev] Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-09-07 13:00 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180907012225.GA32677@debian>

On Fri, Sep 07, 2018 at 09:22:25AM +0800, Tiwei Bie wrote:
> On Mon, Aug 27, 2018 at 05:00:40PM +0300, Michael S. Tsirkin wrote:
> > Are there still plans to test the performance with vost pmd?
> > vhost doesn't seem to show a performance gain ...
> > 
> 
> I tried some performance tests with vhost PMD. In guest, the
> XDP program will return XDP_DROP directly. And in host, testpmd
> will do txonly fwd.
> 
> When burst size is 1 and packet size is 64 in testpmd and
> testpmd needs to iterate 5 Tx queues (but only the first two
> queues are enabled) to prepare and inject packets, I got ~12%
> performance boost (5.7Mpps -> 6.4Mpps). And if the vhost PMD
> is faster (e.g. just need to iterate the first two queues to
> prepare and inject packets), then I got similar performance
> for both rings (~9.9Mpps) (packed ring's performance can be
> lower, because it's more complicated in driver.)
> 
> I think packed ring makes vhost PMD faster, but it doesn't make
> the driver faster. In packed ring, the ring is simplified, and
> the handling of the ring in vhost (device) is also simplified,
> but things are not simplified in driver, e.g. although there is
> no desc table in the virtqueue anymore, driver still needs to
> maintain a private desc state table (which is still managed as
> a list in this patch set) to support the out-of-order desc
> processing in vhost (device).
> 
> I think this patch set is mainly to make the driver have a full
> functional support for the packed ring, which makes it possible
> to leverage the packed ring feature in vhost (device). But I'm
> not sure whether there is any other better idea, I'd like to
> hear your thoughts. Thanks!

Just this: Jens seems to report a nice gain with virtio and
vhost pmd across the board. Try to compare virtio and
virtio pmd to see what does pmd do better?


> 
> > 
> > On Wed, Jul 11, 2018 at 10:27:06AM +0800, Tiwei Bie wrote:
> > > Hello everyone,
> > > 
> > > This patch set implements packed ring support in virtio driver.
> > > 
> > > Some functional tests have been done with Jason's
> > > packed ring implementation in vhost:
> > > 
> > > https://lkml.org/lkml/2018/7/3/33
> > > 
> > > Both of ping and netperf worked as expected.
> > > 
> > > v1 -> v2:
> > > - Use READ_ONCE() to read event off_wrap and flags together (Jason);
> > > - Add comments related to ccw (Jason);
> > > 
> > > RFC (v6) -> v1:
> > > - Avoid extra virtio_wmb() in virtqueue_enable_cb_delayed_packed()
> > >   when event idx is off (Jason);
> > > - Fix bufs calculation in virtqueue_enable_cb_delayed_packed() (Jason);
> > > - Test the state of the desc at used_idx instead of last_used_idx
> > >   in virtqueue_enable_cb_delayed_packed() (Jason);
> > > - Save wrap counter (as part of queue state) in the return value
> > >   of virtqueue_enable_cb_prepare_packed();
> > > - Refine the packed ring definitions in uapi;
> > > - Rebase on the net-next tree;
> > > 
> > > RFC v5 -> RFC v6:
> > > - Avoid tracking addr/len/flags when DMA API isn't used (MST/Jason);
> > > - Define wrap counter as bool (Jason);
> > > - Use ALIGN() in vring_init_packed() (Jason);
> > > - Avoid using pointer to track `next` in detach_buf_packed() (Jason);
> > > - Add comments for barriers (Jason);
> > > - Don't enable RING_PACKED on ccw for now (noticed by Jason);
> > > - Refine the memory barrier in virtqueue_poll();
> > > - Add a missing memory barrier in virtqueue_enable_cb_delayed_packed();
> > > - Remove the hacks in virtqueue_enable_cb_prepare_packed();
> > > 
> > > RFC v4 -> RFC v5:
> > > - Save DMA addr, etc in desc state (Jason);
> > > - Track used wrap counter;
> > > 
> > > RFC v3 -> RFC v4:
> > > - Make ID allocation support out-of-order (Jason);
> > > - Various fixes for EVENT_IDX support;
> > > 
> > > RFC v2 -> RFC v3:
> > > - Split into small patches (Jason);
> > > - Add helper virtqueue_use_indirect() (Jason);
> > > - Just set id for the last descriptor of a list (Jason);
> > > - Calculate the prev in virtqueue_add_packed() (Jason);
> > > - Fix/improve desc suppression code (Jason/MST);
> > > - Refine the code layout for XXX_split/packed and wrappers (MST);
> > > - Fix the comments and API in uapi (MST);
> > > - Remove the BUG_ON() for indirect (Jason);
> > > - Some other refinements and bug fixes;
> > > 
> > > RFC v1 -> RFC v2:
> > > - Add indirect descriptor support - compile test only;
> > > - Add event suppression supprt - compile test only;
> > > - Move vring_packed_init() out of uapi (Jason, MST);
> > > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > > - Avoid using '%' operator (Jason);
> > > - Rename free_head -> next_avail_idx (Jason);
> > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > > - Some other refinements and bug fixes;
> > > 
> > > Thanks!
> > > 
> > > Tiwei Bie (5):
> > >   virtio: add packed ring definitions
> > >   virtio_ring: support creating packed ring
> > >   virtio_ring: add packed ring support
> > >   virtio_ring: add event idx support in packed ring
> > >   virtio_ring: enable packed ring
> > > 
> > >  drivers/s390/virtio/virtio_ccw.c   |   14 +
> > >  drivers/virtio/virtio_ring.c       | 1365 ++++++++++++++++++++++------
> > >  include/linux/virtio_ring.h        |    8 +-
> > >  include/uapi/linux/virtio_config.h |    3 +
> > >  include/uapi/linux/virtio_ring.h   |   43 +
> > >  5 files changed, 1157 insertions(+), 276 deletions(-)
> > > 
> > > -- 
> > > 2.18.0
> > 
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> > For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
> > 

^ permalink raw reply

* Re: [PATCH net-next v2 3/5] virtio_ring: add packed ring support
From: Michael S. Tsirkin @ 2018-09-07 13:49 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180711022711.7090-4-tiwei.bie@intel.com>

On Wed, Jul 11, 2018 at 10:27:09AM +0800, Tiwei Bie wrote:
> This commit introduces the support (without EVENT_IDX) for
> packed ring.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  drivers/virtio/virtio_ring.c | 495 ++++++++++++++++++++++++++++++++++-
>  1 file changed, 487 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c4f8abc7445a..f317b485ba54 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -55,12 +55,21 @@
>  #define END_USE(vq)
>  #endif
>  
> +#define _VRING_DESC_F_AVAIL(b)	((__u16)(b) << 7)
> +#define _VRING_DESC_F_USED(b)	((__u16)(b) << 15)

Underscore followed by an upper case letter isn't a good idea
for a symbol. And it's not nice that this does not
use the VRING_DESC_F_USED from the header.
How about ((b) ? VRING_DESC_F_USED : 0) instead?
Is produced code worse then?

> +
>  struct vring_desc_state {
>  	void *data;			/* Data for callback. */
>  	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
>  };
>  
>  struct vring_desc_state_packed {
> +	void *data;			/* Data for callback. */
> +	struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */

Include vring_desc_state for these?

> +	int num;			/* Descriptor list length. */
> +	dma_addr_t addr;		/* Buffer DMA addr. */
> +	u32 len;			/* Buffer length. */
> +	u16 flags;			/* Descriptor flags. */

This seems only to be used for indirect. Check indirect field above
instead and drop this.

>  	int next;			/* The next desc state. */

Packing things into 16 bit integers will help reduce
cache pressure here.

Also, this is only used for unmap, so when dma API is not used,
maintaining addr and len list management is unnecessary. How about we
maintain addr/len in a separate array, avoiding accesses on unmap in that case?

Also, lots of architectures have a nop unmap in the DMA API.
See a proposed patch at the end for optimizing that case.

>  };
>  
> @@ -660,7 +669,6 @@ static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> -	virtio_mb(vq->weak_barriers);
>  	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
>  }

why is this changing the split queue implementation?


>  
> @@ -757,6 +765,72 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
>  		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
>  }
>  
> +static void vring_unmap_state_packed(const struct vring_virtqueue *vq,
> +				     struct vring_desc_state_packed *state)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = state->flags;
> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 state->addr, state->len,
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       state->addr, state->len,
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
> +				   struct vring_packed_desc *desc)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

I see no reason to use virtioXX wrappers for the packed ring.
That's there to support legacy devices. Just use leXX.

> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> +						       unsigned int total_sg,
> +						       gfp_t gfp)
> +{
> +	struct vring_packed_desc *desc;
> +
> +	/*
> +	 * We require lowmem mappings for the descriptors because
> +	 * otherwise virt_to_phys will give us bogus addresses in the
> +	 * virtqueue.

Where is virt_to_phys used? I don't see it in this patch.

> +	 */
> +	gfp &= ~__GFP_HIGHMEM;
> +
> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> +
> +	return desc;
> +}
> +
>  static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  				       struct scatterlist *sgs[],
>  				       unsigned int total_sg,
> @@ -766,47 +840,449 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  				       void *ctx,
>  				       gfp_t gfp)
>  {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	struct vring_packed_desc *desc;
> +	struct scatterlist *sg;
> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> +	__virtio16 uninitialized_var(head_flags), flags;
> +	u16 head, avail_wrap_counter, id, curr;
> +	bool indirect;
> +
> +	START_USE(vq);
> +
> +	BUG_ON(data == NULL);
> +	BUG_ON(ctx && vq->indirect);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return -EIO;
> +	}
> +
> +#ifdef DEBUG
> +	{
> +		ktime_t now = ktime_get();
> +
> +		/* No kick or get, with .1 second between?  Warn. */
> +		if (vq->last_add_time_valid)
> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> +					    > 100);
> +		vq->last_add_time = now;
> +		vq->last_add_time_valid = true;
> +	}
> +#endif

Add incline helpers for this debug stuff rather than
duplicate it from split ring?


> +
> +	BUG_ON(total_sg == 0);
> +
> +	head = vq->next_avail_idx;
> +	avail_wrap_counter = vq->avail_wrap_counter;
> +
> +	if (virtqueue_use_indirect(_vq, total_sg))
> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> +	else {
> +		desc = NULL;
> +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> +	}


Apparently this attempts to treat indirect descriptors same as
direct ones. This is how it is in the split ring, but not in
the packed one. I think you want two separate functions, for
direct and indirect.

> +
> +	if (desc) {
> +		/* Use a single buffer which doesn't continue */
> +		indirect = true;
> +		/* Set up rest to use this indirect table. */
> +		i = 0;
> +		descs_used = 1;
> +	} else {
> +		indirect = false;
> +		desc = vq->vring_packed.desc;
> +		i = head;
> +		descs_used = total_sg;
> +	}
> +
> +	if (vq->vq.num_free < descs_used) {
> +		pr_debug("Can't add buf len %i - avail = %i\n",
> +			 descs_used, vq->vq.num_free);
> +		/* FIXME: for historical reasons, we force a notify here if
> +		 * there are outgoing parts to the buffer.  Presumably the
> +		 * host should service the ring ASAP. */
> +		if (out_sgs)
> +			vq->notify(&vq->vq);
> +		if (indirect)
> +			kfree(desc);
> +		END_USE(vq);
> +		return -ENOSPC;
> +	}
> +
> +	id = vq->free_head;
> +	BUG_ON(id == vq->vring_packed.num);
> +
> +	curr = id;
> +	for (n = 0; n < out_sgs + in_sgs; n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> +					       DMA_TO_DEVICE : DMA_FROM_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> +				  (n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> +				  _VRING_DESC_F_AVAIL(vq->avail_wrap_counter) |
> +				  _VRING_DESC_F_USED(!vq->avail_wrap_counter));

Spec says:
	The VIRTQ_DESC_F_WRITE flags bit is the only valid flag for descriptors in the
	indirect table.

All this logic isn't needed for indirect.

Also, why re-calculate avail/used flags every time? They only change
when you wrap around.


> +			if (!indirect && i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> +			i++;
> +			if (!indirect) {
> +				if (vring_use_dma_api(_vq->vdev)) {
> +					vq->desc_state_packed[curr].addr = addr;
> +					vq->desc_state_packed[curr].len =
> +						sg->length;
> +					vq->desc_state_packed[curr].flags =
> +						virtio16_to_cpu(_vq->vdev,
> +								flags);
> +				}
> +				curr = vq->desc_state_packed[curr].next;
> +
> +				if (i >= vq->vring_packed.num) {
> +					i = 0;
> +					vq->avail_wrap_counter ^= 1;
> +				}
> +			}
> +		}
> +	}
> +
> +	prev = (i > 0 ? i : vq->vring_packed.num) - 1;
> +	desc[prev].id = cpu_to_virtio16(_vq->vdev, id);

Is it easier to write this out in all descriptors, to avoid the need to
calculate prev?

> +
> +	/* Last one doesn't continue. */
> +	if (total_sg == 1)
> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> +	else
> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev,
> +						~VRING_DESC_F_NEXT);

Wouldn't it be easier to avoid setting VRING_DESC_F_NEXT
in the first place?
	 if (n != in_sgs - 1 && n != out_sgs - 1)

must be better than writing descriptor an extra time.

> +
> +	if (indirect) {
> +		/* Now that the indirect table is filled in, map it. */
> +		dma_addr_t addr = vring_map_single(
> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> +			DMA_TO_DEVICE);
> +		if (vring_mapping_error(vq, addr))
> +			goto unmap_release;
> +
> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> +				      _VRING_DESC_F_AVAIL(avail_wrap_counter) |
> +				      _VRING_DESC_F_USED(!avail_wrap_counter));
> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev,
> +								   addr);
> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> +				total_sg * sizeof(struct vring_packed_desc));
> +		vq->vring_packed.desc[head].id = cpu_to_virtio16(_vq->vdev, id);
> +
> +		if (vring_use_dma_api(_vq->vdev)) {
> +			vq->desc_state_packed[id].addr = addr;
> +			vq->desc_state_packed[id].len = total_sg *
> +					sizeof(struct vring_packed_desc);
> +			vq->desc_state_packed[id].flags =
> +					virtio16_to_cpu(_vq->vdev, head_flags);
> +		}
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= descs_used;
> +
> +	/* Update free pointer */
> +	if (indirect) {
> +		n = head + 1;
> +		if (n >= vq->vring_packed.num) {
> +			n = 0;
> +			vq->avail_wrap_counter ^= 1;
> +		}
> +		vq->next_avail_idx = n;
> +		vq->free_head = vq->desc_state_packed[id].next;
> +	} else {
> +		vq->next_avail_idx = i;
> +		vq->free_head = curr;
> +	}
> +
> +	/* Store token and indirect buffer state. */
> +	vq->desc_state_packed[id].num = descs_used;
> +	vq->desc_state_packed[id].data = data;
> +	if (indirect)
> +		vq->desc_state_packed[id].indir_desc = desc;
> +	else
> +		vq->desc_state_packed[id].indir_desc = ctx;
> +
> +	/* A driver MUST NOT make the first descriptor in the list
> +	 * available before all subsequent descriptors comprising
> +	 * the list are made available. */
> +	virtio_wmb(vq->weak_barriers);
> +	vq->vring_packed.desc[head].flags = head_flags;
> +	vq->num_added += descs_used;
> +
> +	pr_debug("Added buffer head %i to %p\n", head, vq);
> +	END_USE(vq);
> +
> +	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_desc_packed(vq, &desc[i]);
> +		i++;
> +		if (!indirect && i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->avail_wrap_counter = avail_wrap_counter;
> +
> +	if (indirect)
> +		kfree(desc);
> +
> +	END_USE(vq);
>  	return -EIO;
>  }
>  
>  static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>  {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 flags;
> +	bool needs_kick;
> +	u32 snapshot;
> +
> +	START_USE(vq);
> +	/* We need to expose the new flags value before checking notification
> +	 * suppressions. */
> +	virtio_mb(vq->weak_barriers);
> +
> +	snapshot = READ_ONCE(*(u32 *)vq->vring_packed.device);
> +	flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;

What are all these hard-coded things? Also either we do READ_ONCE
everywhere or nowhere. Why is this place special? Any why read 32 bit
if you only want the 16?

And doesn't sparse complain about cast to __virtio16?

> +
> +#ifdef DEBUG
> +	if (vq->last_add_time_valid) {
> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> +					      vq->last_add_time)) > 100);
> +	}
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> +	END_USE(vq);
> +	return needs_kick;
> +}
> +
> +static void detach_buf_packed(struct vring_virtqueue *vq,
> +			      unsigned int id, void **ctx)
> +{
> +	struct vring_desc_state_packed *state = NULL;
> +	struct vring_packed_desc *desc;
> +	unsigned int curr, i;
> +
> +	/* Clear data ptr. */
> +	vq->desc_state_packed[id].data = NULL;
> +
> +	curr = id;
> +	for (i = 0; i < vq->desc_state_packed[id].num; i++) {
> +		state = &vq->desc_state_packed[curr];
> +		vring_unmap_state_packed(vq, state);
> +		curr = state->next;
> +	}
> +
> +	BUG_ON(state == NULL);
> +	vq->vq.num_free += vq->desc_state_packed[id].num;
> +	state->next = vq->free_head;
> +	vq->free_head = id;
> +
> +	if (vq->indirect) {
> +		u32 len;
> +
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		desc = vq->desc_state_packed[id].indir_desc;
> +		if (!desc)
> +			return;
> +
> +		if (vring_use_dma_api(vq->vq.vdev)) {
> +			len = vq->desc_state_packed[id].len;
> +			for (i = 0; i < len / sizeof(struct vring_packed_desc);
> +					i++)
> +				vring_unmap_desc_packed(vq, &desc[i]);
> +		}
> +		kfree(desc);
> +		vq->desc_state_packed[id].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state_packed[id].indir_desc;
> +	}
> +}
> +
> +static inline bool is_used_desc_packed(const struct vring_virtqueue *vq,
> +				       u16 idx, bool used_wrap_counter)
> +{
> +	u16 flags;
> +	bool avail, used;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[idx].flags);
> +	avail = !!(flags & VRING_DESC_F_AVAIL);
> +	used = !!(flags & VRING_DESC_F_USED);
> +
> +	return avail == used && used == used_wrap_counter;

I think that you don't need to look at avail flag to detect a used
descriptor. The reason device writes it is to avoid confusing
*device* next time descriptor wraps.

>  }
>  
>  static inline bool more_used_packed(const struct vring_virtqueue *vq)
>  {
> -	return false;
> +	return is_used_desc_packed(vq, vq->last_used_idx,
> +			vq->used_wrap_counter);
>  }
>  
>  static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>  					  unsigned int *len,
>  					  void **ctx)
>  {
> -	return NULL;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used, id;
> +	void *ret;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used_packed(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used elements after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = vq->last_used_idx;
> +	id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> +	if (unlikely(id >= vq->vring_packed.num)) {
> +		BAD_RING(vq, "id %u out of range\n", id);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state_packed[id].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", id);
> +		return NULL;
> +	}
> +
> +	vq->last_used_idx += vq->desc_state_packed[id].num;
> +	if (vq->last_used_idx >= vq->vring_packed.num) {
> +		vq->last_used_idx -= vq->vring_packed.num;
> +		vq->used_wrap_counter ^= 1;
> +	}
> +
> +	/* detach_buf_packed clears data, so grab it now. */
> +	ret = vq->desc_state_packed[id].data;
> +	detach_buf_packed(vq, id, ctx);
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
>  }
>  
>  static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
>  {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
>  }
>  
>  static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>  {
> -	return 0;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +	}
> +
> +	END_USE(vq);
> +	return vq->last_used_idx | ((u16)vq->used_wrap_counter << 15);
>  }
>  
> -static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> +static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned off_wrap)
>  {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	bool wrap_counter;
> +	u16 used_idx;
> +
> +	wrap_counter = off_wrap >> 15;
> +	used_idx = off_wrap & ~(1 << 15);
> +
> +	return is_used_desc_packed(vq, used_idx, wrap_counter);

These >> 15 << 15 all over the place duplicate info.
Pls put 15 in the header.

Also can you maintain the counters properly shifted?
Then just use them.


>  }
>  
>  static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
>  {
> -	return false;
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> +		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->event_flags_shadow);
> +		/* We need to enable interrupts first before re-checking
> +		 * for more used buffers. */
> +		virtio_mb(vq->weak_barriers);
> +	}
> +
> +	if (more_used_packed(vq)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
>  }
>  
>  static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
>  {
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	unsigned int i;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring_packed.num; i++) {
> +		if (!vq->desc_state_packed[i].data)
> +			continue;
> +		/* detach_buf clears data, so grab it now. */
> +		buf = vq->desc_state_packed[i].data;
> +		detach_buf_packed(vq, i, NULL);
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring_packed.num);
> +
> +	END_USE(vq);
>  	return NULL;
>  }
>  
> @@ -1083,6 +1559,9 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> +	/* We need to enable interrupts first before re-checking
> +	 * for more used buffers. */
> +	virtio_mb(vq->weak_barriers);
>  	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
>  			    virtqueue_poll_split(_vq, last_used_idx);
>  }

Possible optimization for when dma API is in use:

--->

dma: detecting nop unmap

drivers need to maintain the dma address for unmap purposes,
but these cycles are wasted when unmap callback is not
defined. Add an API for drivers to check that and avoid
unmap completely. Debug builds still have unmap.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

----

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index a785f2507159..38b2c27c8449 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -42,6 +42,11 @@ extern void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
 				 size_t size, int direction, bool map_single);
 
+static inline bool has_debug_dma_unmap(struct device *dev)
+{
+	return true;
+}
+
 extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 			     int nents, int mapped_ents, int direction);
 
@@ -121,6 +126,11 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
 {
 }
 
+static inline bool has_debug_dma_unmap(struct device *dev)
+{
+	return false;
+}
+
 static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 				    int nents, int mapped_ents, int direction)
 {
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 1db6a6b46d0d..656f3e518166 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -241,6 +241,14 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 	return addr;
 }
 
+static inline bool has_dma_unmap(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	return ops->unmap_page || ops->unmap_sg || ops->unmap_resource ||
+		has_dma_unmap(dev);
+}
+
 static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 					  size_t size,
 					  enum dma_data_direction dir,

^ permalink raw reply related

* Re: [PATCH net-next v2 1/5] virtio: add packed ring definitions
From: Michael S. Tsirkin @ 2018-09-07 13:51 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180711022711.7090-2-tiwei.bie@intel.com>

On Wed, Jul 11, 2018 at 10:27:07AM +0800, Tiwei Bie wrote:
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>  include/uapi/linux/virtio_config.h |  3 +++
>  include/uapi/linux/virtio_ring.h   | 43 ++++++++++++++++++++++++++++++
>  2 files changed, 46 insertions(+)
> 
> diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
> index 449132c76b1c..1196e1c1d4f6 100644
> --- a/include/uapi/linux/virtio_config.h
> +++ b/include/uapi/linux/virtio_config.h
> @@ -75,6 +75,9 @@
>   */
>  #define VIRTIO_F_IOMMU_PLATFORM		33
>  
> +/* This feature indicates support for the packed virtqueue layout. */
> +#define VIRTIO_F_RING_PACKED		34
> +
>  /*
>   * Does the device support Single Root I/O Virtualization?
>   */
> diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
> index 6d5d5faa989b..0254a2ba29cf 100644
> --- a/include/uapi/linux/virtio_ring.h
> +++ b/include/uapi/linux/virtio_ring.h
> @@ -44,6 +44,10 @@
>  /* This means the buffer contains a list of buffer descriptors. */
>  #define VRING_DESC_F_INDIRECT	4
>  
> +/* Mark a descriptor as available or used. */
> +#define VRING_DESC_F_AVAIL	(1ul << 7)
> +#define VRING_DESC_F_USED	(1ul << 15)
> +
>  /* The Host uses this in used->flags to advise the Guest: don't kick me when
>   * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
>   * will still kick if it's out of buffers. */
> @@ -53,6 +57,17 @@
>   * optimization.  */
>  #define VRING_AVAIL_F_NO_INTERRUPT	1
>  
> +/* Enable events. */
> +#define VRING_EVENT_F_ENABLE	0x0
> +/* Disable events. */
> +#define VRING_EVENT_F_DISABLE	0x1
> +/*
> + * Enable events for a specific descriptor
> + * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
> + * Only valid if VIRTIO_RING_F_EVENT_IDX has been negotiated.
> + */
> +#define VRING_EVENT_F_DESC	0x2
> +
>  /* We support indirect buffer descriptors */
>  #define VIRTIO_RING_F_INDIRECT_DESC	28
>  

These are for the packed ring, right? Pls prefix accordingly.
Also, you likely need macros for the wrap counters.

> @@ -171,4 +186,32 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
>  	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
>  }
>  
> +struct vring_packed_desc_event {
> +	/* Descriptor Ring Change Event Offset/Wrap Counter. */
> +	__virtio16 off_wrap;
> +	/* Descriptor Ring Change Event Flags. */
> +	__virtio16 flags;
> +};
> +
> +struct vring_packed_desc {
> +	/* Buffer Address. */
> +	__virtio64 addr;
> +	/* Buffer Length. */
> +	__virtio32 len;
> +	/* Buffer ID. */
> +	__virtio16 id;
> +	/* The flags depending on descriptor type. */
> +	__virtio16 flags;
> +};

Don't use __virtioXX types, just __leXX ones.

> +
> +struct vring_packed {
> +	unsigned int num;
> +
> +	struct vring_packed_desc *desc;
> +
> +	struct vring_packed_desc_event *driver;
> +
> +	struct vring_packed_desc_event *device;
> +};
> +
>  #endif /* _UAPI_LINUX_VIRTIO_RING_H */
> -- 
> 2.18.0

^ permalink raw reply

* Re: [PATCH net-next v2 2/5] virtio_ring: support creating packed ring
From: Michael S. Tsirkin @ 2018-09-07 14:03 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180711022711.7090-3-tiwei.bie@intel.com>

On Wed, Jul 11, 2018 at 10:27:08AM +0800, Tiwei Bie wrote:
> This commit introduces the support for creating packed ring.
> All split ring specific functions are added _split suffix.
> Some necessary stubs for packed ring are also added.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>

I'd rather have a patch just renaming split functions, then
add all packed stuff in as a separate patch on top.


> ---
>  drivers/virtio/virtio_ring.c | 801 +++++++++++++++++++++++------------
>  include/linux/virtio_ring.h  |   8 +-
>  2 files changed, 546 insertions(+), 263 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 814b395007b2..c4f8abc7445a 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -60,11 +60,15 @@ struct vring_desc_state {
>  	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
>  };
>  
> +struct vring_desc_state_packed {
> +	int next;			/* The next desc state. */

So this can go away with IN_ORDER?

> +};
> +
>  struct vring_virtqueue {
>  	struct virtqueue vq;
>  
> -	/* Actual memory layout for this queue */
> -	struct vring vring;
> +	/* Is this a packed ring? */
> +	bool packed;
>  
>  	/* Can we use weak barriers? */
>  	bool weak_barriers;
> @@ -86,11 +90,39 @@ struct vring_virtqueue {
>  	/* Last used index we've seen. */
>  	u16 last_used_idx;
>  
> -	/* Last written value to avail->flags */
> -	u16 avail_flags_shadow;
> +	union {
> +		/* Available for split ring */
> +		struct {
> +			/* Actual memory layout for this queue. */
> +			struct vring vring;
>  
> -	/* Last written value to avail->idx in guest byte order */
> -	u16 avail_idx_shadow;
> +			/* Last written value to avail->flags */
> +			u16 avail_flags_shadow;
> +
> +			/* Last written value to avail->idx in
> +			 * guest byte order. */
> +			u16 avail_idx_shadow;
> +		};

Name this field split so it's easier to detect misuse of e.g.
packed fields in split code?

> +
> +		/* Available for packed ring */
> +		struct {
> +			/* Actual memory layout for this queue. */
> +			struct vring_packed vring_packed;
> +
> +			/* Driver ring wrap counter. */
> +			bool avail_wrap_counter;
> +
> +			/* Device ring wrap counter. */
> +			bool used_wrap_counter;
> +
> +			/* Index of the next avail descriptor. */
> +			u16 next_avail_idx;
> +
> +			/* Last written value to driver->flags in
> +			 * guest byte order. */
> +			u16 event_flags_shadow;
> +		};
> +	};
>  
>  	/* How to notify other side. FIXME: commonalize hcalls! */
>  	bool (*notify)(struct virtqueue *vq);
> @@ -110,11 +142,24 @@ struct vring_virtqueue {
>  #endif
>  
>  	/* Per-descriptor state. */
> -	struct vring_desc_state desc_state[];
> +	union {
> +		struct vring_desc_state desc_state[1];
> +		struct vring_desc_state_packed desc_state_packed[1];
> +	};
>  };
>  
>  #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
>  
> +static inline bool virtqueue_use_indirect(struct virtqueue *_vq,
> +					  unsigned int total_sg)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	/* If the host supports indirect descriptor tables, and we have multiple
> +	 * buffers, then go indirect. FIXME: tune this threshold */
> +	return (vq->indirect && total_sg > 1 && vq->vq.num_free);
> +}
> +
>  /*
>   * Modern virtio devices have feature bits to specify whether they need a
>   * quirk and bypass the IOMMU. If not there, just use the DMA API.
> @@ -200,8 +245,17 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
>  			      cpu_addr, size, direction);
>  }
>  
> -static void vring_unmap_one(const struct vring_virtqueue *vq,
> -			    struct vring_desc *desc)
> +static int vring_mapping_error(const struct vring_virtqueue *vq,
> +			       dma_addr_t addr)
> +{
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return 0;
> +
> +	return dma_mapping_error(vring_dma_dev(vq), addr);
> +}
> +
> +static void vring_unmap_one_split(const struct vring_virtqueue *vq,
> +				  struct vring_desc *desc)
>  {
>  	u16 flags;
>  
> @@ -225,17 +279,9 @@ static void vring_unmap_one(const struct vring_virtqueue *vq,
>  	}
>  }
>  
> -static int vring_mapping_error(const struct vring_virtqueue *vq,
> -			       dma_addr_t addr)
> -{
> -	if (!vring_use_dma_api(vq->vq.vdev))
> -		return 0;
> -
> -	return dma_mapping_error(vring_dma_dev(vq), addr);
> -}
> -
> -static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> -					 unsigned int total_sg, gfp_t gfp)
> +static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> +					       unsigned int total_sg,
> +					       gfp_t gfp)
>  {
>  	struct vring_desc *desc;
>  	unsigned int i;
> @@ -256,14 +302,14 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
>  	return desc;
>  }
>  
> -static inline int virtqueue_add(struct virtqueue *_vq,
> -				struct scatterlist *sgs[],
> -				unsigned int total_sg,
> -				unsigned int out_sgs,
> -				unsigned int in_sgs,
> -				void *data,
> -				void *ctx,
> -				gfp_t gfp)
> +static inline int virtqueue_add_split(struct virtqueue *_vq,
> +				      struct scatterlist *sgs[],
> +				      unsigned int total_sg,
> +				      unsigned int out_sgs,
> +				      unsigned int in_sgs,
> +				      void *data,
> +				      void *ctx,
> +				      gfp_t gfp)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  	struct scatterlist *sg;
> @@ -299,10 +345,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  
>  	head = vq->free_head;
>  
> -	/* If the host supports indirect descriptor tables, and we have multiple
> -	 * buffers, then go indirect. FIXME: tune this threshold */
> -	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> -		desc = alloc_indirect(_vq, total_sg, gfp);
> +	if (virtqueue_use_indirect(_vq, total_sg))
> +		desc = alloc_indirect_split(_vq, total_sg, gfp);
>  	else {
>  		desc = NULL;
>  		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> @@ -423,7 +467,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  	for (n = 0; n < total_sg; n++) {
>  		if (i == err_idx)
>  			break;
> -		vring_unmap_one(vq, &desc[i]);
> +		vring_unmap_one_split(vq, &desc[i]);
>  		i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
>  	}
>  
> @@ -434,6 +478,355 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  	return -EIO;
>  }
>  
> +static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 new, old;
> +	bool needs_kick;
> +
> +	START_USE(vq);
> +	/* We need to expose available array entries before checking avail
> +	 * event. */
> +	virtio_mb(vq->weak_barriers);
> +
> +	old = vq->avail_idx_shadow - vq->num_added;
> +	new = vq->avail_idx_shadow;
> +	vq->num_added = 0;
> +
> +#ifdef DEBUG
> +	if (vq->last_add_time_valid) {
> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> +					      vq->last_add_time)) > 100);
> +	}
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	if (vq->event) {
> +		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
> +					      new, old);
> +	} else {
> +		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
> +	}
> +	END_USE(vq);
> +	return needs_kick;
> +}
> +
> +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> +			     void **ctx)
> +{
> +	unsigned int i, j;
> +	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> +
> +	/* Clear data ptr. */
> +	vq->desc_state[head].data = NULL;
> +
> +	/* Put back on free list: unmap first-level descriptors and find end */
> +	i = head;
> +
> +	while (vq->vring.desc[i].flags & nextflag) {
> +		vring_unmap_one_split(vq, &vq->vring.desc[i]);
> +		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
> +		vq->vq.num_free++;
> +	}
> +
> +	vring_unmap_one_split(vq, &vq->vring.desc[i]);
> +	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
> +	vq->free_head = head;
> +
> +	/* Plus final descriptor */
> +	vq->vq.num_free++;
> +
> +	if (vq->indirect) {
> +		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
> +		u32 len;
> +
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		if (!indir_desc)
> +			return;
> +
> +		len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
> +
> +		BUG_ON(!(vq->vring.desc[head].flags &
> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> +		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> +
> +		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> +			vring_unmap_one_split(vq, &indir_desc[j]);
> +
> +		kfree(indir_desc);
> +		vq->desc_state[head].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state[head].indir_desc;
> +	}
> +}
> +
> +static inline bool more_used_split(const struct vring_virtqueue *vq)
> +{
> +	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
> +}
> +
> +static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
> +					 unsigned int *len,
> +					 void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	void *ret;
> +	unsigned int i;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used_split(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used array entries after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = (vq->last_used_idx & (vq->vring.num - 1));
> +	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> +
> +	if (unlikely(i >= vq->vring.num)) {
> +		BAD_RING(vq, "id %u out of range\n", i);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state[i].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", i);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_split clears data, so grab it now. */
> +	ret = vq->desc_state[i].data;
> +	detach_buf_split(vq, i, ctx);
> +	vq->last_used_idx++;
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> +		virtio_store_mb(vq->weak_barriers,
> +				&vring_used_event(&vq->vring),
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
> +static void virtqueue_disable_cb_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> +		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +}
> +
> +static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 last_used_idx;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always do both to keep code simple. */
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> +	END_USE(vq);
> +	return last_used_idx;
> +}
> +
> +static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	virtio_mb(vq->weak_barriers);
> +	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> +}
> +
> +static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs;
> +
> +	START_USE(vq);
> +
> +	/* We optimistically turn back on interrupts, then check if there was
> +	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +	/* TODO: tune this threshold */
> +	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> +
> +	virtio_store_mb(vq->weak_barriers,
> +			&vring_used_event(&vq->vring),
> +			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> +
> +	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> +		END_USE(vq);
> +		return false;
> +	}
> +
> +	END_USE(vq);
> +	return true;
> +}
> +
> +static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	unsigned int i;
> +	void *buf;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring.num; i++) {
> +		if (!vq->desc_state[i].data)
> +			continue;
> +		/* detach_buf clears data, so grab it now. */
> +		buf = vq->desc_state[i].data;
> +		detach_buf_split(vq, i, NULL);
> +		vq->avail_idx_shadow--;
> +		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> +		END_USE(vq);
> +		return buf;
> +	}
> +	/* That should have freed everything. */
> +	BUG_ON(vq->vq.num_free != vq->vring.num);
> +
> +	END_USE(vq);
> +	return NULL;
> +}
> +
> +/*
> + * The layout for the packed ring is a continuous chunk of memory
> + * which looks like this.
> + *
> + * struct vring_packed {
> + *	// The actual descriptors (16 bytes each)
> + *	struct vring_packed_desc desc[num];
> + *
> + *	// Padding to the next align boundary.
> + *	char pad[];
> + *
> + *	// Driver Event Suppression
> + *	struct vring_packed_desc_event driver;
> + *
> + *	// Device Event Suppression
> + *	struct vring_packed_desc_event device;
> + * };
> + */

Why not just allocate event structures separately?
Is it a win to have them share a cache line for some reason?

> +static inline void vring_init_packed(struct vring_packed *vr, unsigned int num,
> +				     void *p, unsigned long align)
> +{
> +	vr->num = num;
> +	vr->desc = p;
> +	vr->driver = (void *)ALIGN(((uintptr_t)p +
> +		sizeof(struct vring_packed_desc) * num), align);
> +	vr->device = vr->driver + 1;
> +}

What's all this about alignment? Where does it come from?

> +
> +static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
> +{
> +	return ((sizeof(struct vring_packed_desc) * num + align - 1)
> +		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> +}
> +
> +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> +				       struct scatterlist *sgs[],
> +				       unsigned int total_sg,
> +				       unsigned int out_sgs,
> +				       unsigned int in_sgs,
> +				       void *data,
> +				       void *ctx,
> +				       gfp_t gfp)
> +{
> +	return -EIO;
> +}
> +
> +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> +{
> +	return false;
> +}
> +
> +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> +{
> +	return false;
> +}
> +
> +static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> +					  unsigned int *len,
> +					  void **ctx)
> +{
> +	return NULL;
> +}
> +
> +static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> +{
> +}
> +
> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> +{
> +	return 0;
> +}
> +
> +static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> +{
> +	return false;
> +}
> +
> +static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> +{
> +	return false;
> +}
> +
> +static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
> +{
> +	return NULL;
> +}
> +
> +static inline int virtqueue_add(struct virtqueue *_vq,
> +				struct scatterlist *sgs[],
> +				unsigned int total_sg,
> +				unsigned int out_sgs,
> +				unsigned int in_sgs,
> +				void *data,
> +				void *ctx,
> +				gfp_t gfp)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
> +						 in_sgs, data, ctx, gfp) :
> +			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
> +						in_sgs, data, ctx, gfp);
> +}
> +
>  /**
>   * virtqueue_add_sgs - expose buffers to other end
>   * @vq: the struct virtqueue we're talking about.
> @@ -550,34 +943,9 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
>  bool virtqueue_kick_prepare(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 new, old;
> -	bool needs_kick;
>  
> -	START_USE(vq);
> -	/* We need to expose available array entries before checking avail
> -	 * event. */
> -	virtio_mb(vq->weak_barriers);
> -
> -	old = vq->avail_idx_shadow - vq->num_added;
> -	new = vq->avail_idx_shadow;
> -	vq->num_added = 0;
> -
> -#ifdef DEBUG
> -	if (vq->last_add_time_valid) {
> -		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> -					      vq->last_add_time)) > 100);
> -	}
> -	vq->last_add_time_valid = false;
> -#endif
> -
> -	if (vq->event) {
> -		needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
> -					      new, old);
> -	} else {
> -		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
> -	}
> -	END_USE(vq);
> -	return needs_kick;
> +	return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
> +			    virtqueue_kick_prepare_split(_vq);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
>  
> @@ -625,58 +993,9 @@ bool virtqueue_kick(struct virtqueue *vq)
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_kick);
>  
> -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> -		       void **ctx)
> -{
> -	unsigned int i, j;
> -	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> -
> -	/* Clear data ptr. */
> -	vq->desc_state[head].data = NULL;
> -
> -	/* Put back on free list: unmap first-level descriptors and find end */
> -	i = head;
> -
> -	while (vq->vring.desc[i].flags & nextflag) {
> -		vring_unmap_one(vq, &vq->vring.desc[i]);
> -		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
> -		vq->vq.num_free++;
> -	}
> -
> -	vring_unmap_one(vq, &vq->vring.desc[i]);
> -	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
> -	vq->free_head = head;
> -
> -	/* Plus final descriptor */
> -	vq->vq.num_free++;
> -
> -	if (vq->indirect) {
> -		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
> -		u32 len;
> -
> -		/* Free the indirect table, if any, now that it's unmapped. */
> -		if (!indir_desc)
> -			return;
> -
> -		len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
> -
> -		BUG_ON(!(vq->vring.desc[head].flags &
> -			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> -		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> -
> -		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> -			vring_unmap_one(vq, &indir_desc[j]);
> -
> -		kfree(indir_desc);
> -		vq->desc_state[head].indir_desc = NULL;
> -	} else if (ctx) {
> -		*ctx = vq->desc_state[head].indir_desc;
> -	}
> -}
> -
>  static inline bool more_used(const struct vring_virtqueue *vq)
>  {
> -	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
> +	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
>  }
>  
>  /**
> @@ -699,57 +1018,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
>  			    void **ctx)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
> -	void *ret;
> -	unsigned int i;
> -	u16 last_used;
>  
> -	START_USE(vq);
> -
> -	if (unlikely(vq->broken)) {
> -		END_USE(vq);
> -		return NULL;
> -	}
> -
> -	if (!more_used(vq)) {
> -		pr_debug("No more buffers in queue\n");
> -		END_USE(vq);
> -		return NULL;
> -	}
> -
> -	/* Only get used array entries after they have been exposed by host. */
> -	virtio_rmb(vq->weak_barriers);
> -
> -	last_used = (vq->last_used_idx & (vq->vring.num - 1));
> -	i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
> -	*len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
> -
> -	if (unlikely(i >= vq->vring.num)) {
> -		BAD_RING(vq, "id %u out of range\n", i);
> -		return NULL;
> -	}
> -	if (unlikely(!vq->desc_state[i].data)) {
> -		BAD_RING(vq, "id %u is not a head!\n", i);
> -		return NULL;
> -	}
> -
> -	/* detach_buf clears data, so grab it now. */
> -	ret = vq->desc_state[i].data;
> -	detach_buf(vq, i, ctx);
> -	vq->last_used_idx++;
> -	/* If we expect an interrupt for the next entry, tell host
> -	 * by writing event index and flush out the write before
> -	 * the read in the next get_buf call. */
> -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> -		virtio_store_mb(vq->weak_barriers,
> -				&vring_used_event(&vq->vring),
> -				cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> -
> -#ifdef DEBUG
> -	vq->last_add_time_valid = false;
> -#endif
> -
> -	END_USE(vq);
> -	return ret;
> +	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
> +			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
>  
> @@ -771,12 +1042,10 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -
> +	if (vq->packed)
> +		virtqueue_disable_cb_packed(_vq);
> +	else
> +		virtqueue_disable_cb_split(_vq);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>  
> @@ -795,23 +1064,9 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>  unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 last_used_idx;
>  
> -	START_USE(vq);
> -
> -	/* We optimistically turn back on interrupts, then check if there was
> -	 * more to do. */
> -	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> -	 * either clear the flags bit or point the event index at the next
> -	 * entry. Always do both to keep code simple. */
> -	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> -		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> -	END_USE(vq);
> -	return last_used_idx;
> +	return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
> +			    virtqueue_enable_cb_prepare_split(_vq);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
>  
> @@ -828,8 +1083,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> -	virtio_mb(vq->weak_barriers);
> -	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> +	return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
> +			    virtqueue_poll_split(_vq, last_used_idx);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_poll);
>  
> @@ -867,34 +1122,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
>  bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 bufs;
>  
> -	START_USE(vq);
> -
> -	/* We optimistically turn back on interrupts, then check if there was
> -	 * more to do. */
> -	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> -	 * either clear the flags bit or point the event index at the next
> -	 * entry. Always update the event index to keep code simple. */
> -	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> -		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -	/* TODO: tune this threshold */
> -	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
> -
> -	virtio_store_mb(vq->weak_barriers,
> -			&vring_used_event(&vq->vring),
> -			cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
> -
> -	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> -		END_USE(vq);
> -		return false;
> -	}
> -
> -	END_USE(vq);
> -	return true;
> +	return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
> +			    virtqueue_enable_cb_delayed_split(_vq);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
>  
> @@ -909,27 +1139,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
>  void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
> -	unsigned int i;
> -	void *buf;
>  
> -	START_USE(vq);
> -
> -	for (i = 0; i < vq->vring.num; i++) {
> -		if (!vq->desc_state[i].data)
> -			continue;
> -		/* detach_buf clears data, so grab it now. */
> -		buf = vq->desc_state[i].data;
> -		detach_buf(vq, i, NULL);
> -		vq->avail_idx_shadow--;
> -		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> -		END_USE(vq);
> -		return buf;
> -	}
> -	/* That should have freed everything. */
> -	BUG_ON(vq->vq.num_free != vq->vring.num);
> -
> -	END_USE(vq);
> -	return NULL;
> +	return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
> +			    virtqueue_detach_unused_buf_split(_vq);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
>  
> @@ -954,7 +1166,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>  EXPORT_SYMBOL_GPL(vring_interrupt);
>  
>  struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>  					struct virtio_device *vdev,
>  					bool weak_barriers,
>  					bool context,
> @@ -962,19 +1175,22 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  					void (*callback)(struct virtqueue *),
>  					const char *name)
>  {
> -	unsigned int i;
>  	struct vring_virtqueue *vq;
> +	unsigned int num, i;
> +	size_t size;
>  
> -	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
> -		     GFP_KERNEL);
> +	num = packed ? vring.vring_packed.num : vring.vring_split.num;
> +	size = packed ? num * sizeof(struct vring_desc_state_packed) :
> +			num * sizeof(struct vring_desc_state);
> +
> +	vq = kmalloc(sizeof(*vq) + size, GFP_KERNEL);
>  	if (!vq)
>  		return NULL;
>  
> -	vq->vring = vring;
>  	vq->vq.callback = callback;
>  	vq->vq.vdev = vdev;
>  	vq->vq.name = name;
> -	vq->vq.num_free = vring.num;
> +	vq->vq.num_free = num;
>  	vq->vq.index = index;
>  	vq->we_own_ring = false;
>  	vq->queue_dma_addr = 0;
> @@ -983,9 +1199,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  	vq->weak_barriers = weak_barriers;
>  	vq->broken = false;
>  	vq->last_used_idx = 0;
> -	vq->avail_flags_shadow = 0;
> -	vq->avail_idx_shadow = 0;
>  	vq->num_added = 0;
> +	vq->packed = packed;
>  	list_add_tail(&vq->vq.list, &vdev->vqs);
>  #ifdef DEBUG
>  	vq->in_use = false;
> @@ -996,19 +1211,48 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  		!context;
>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>  
> +	if (vq->packed) {
> +		vq->vring_packed = vring.vring_packed;
> +		vq->next_avail_idx = 0;
> +		vq->avail_wrap_counter = 1;
> +		vq->used_wrap_counter = 1;
> +		vq->event_flags_shadow = 0;
> +
> +		memset(vq->desc_state_packed, 0,
> +			num * sizeof(struct vring_desc_state_packed));
> +
> +		/* Put everything in free lists. */
> +		vq->free_head = 0;
> +		for (i = 0; i < num-1; i++)
> +			vq->desc_state_packed[i].next = i + 1;
> +	} else {
> +		vq->vring = vring.vring_split;
> +		vq->avail_flags_shadow = 0;
> +		vq->avail_idx_shadow = 0;
> +
> +		/* Put everything in free lists. */
> +		vq->free_head = 0;
> +		for (i = 0; i < num-1; i++)
> +			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> +
> +		memset(vq->desc_state, 0,
> +			num * sizeof(struct vring_desc_state));
> +	}
> +
>  	/* No callback?  Tell other side not to bother us. */
>  	if (!callback) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
> +		if (packed) {
> +			vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> +			vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
> +						vq->event_flags_shadow);
> +		} else {
> +			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +			if (!vq->event)
> +				vq->vring.avail->flags = cpu_to_virtio16(vdev,
> +						vq->avail_flags_shadow);
> +		}
>  	}
>  
> -	/* Put everything in free lists. */
> -	vq->free_head = 0;
> -	for (i = 0; i < vring.num-1; i++)
> -		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> -	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
> -
>  	return &vq->vq;
>  }
>  EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
> @@ -1055,6 +1299,12 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
>  	}
>  }
>  
> +static inline int
> +__vring_size(unsigned int num, unsigned long align, bool packed)
> +{
> +	return packed ? vring_size_packed(num, align) : vring_size(num, align);
> +}
> +
>  struct virtqueue *vring_create_virtqueue(
>  	unsigned int index,
>  	unsigned int num,
> @@ -1071,7 +1321,8 @@ struct virtqueue *vring_create_virtqueue(
>  	void *queue = NULL;
>  	dma_addr_t dma_addr;
>  	size_t queue_size_in_bytes;
> -	struct vring vring;
> +	union vring_union vring;
> +	bool packed;
>  
>  	/* We assume num is a power of 2. */
>  	if (num & (num - 1)) {
> @@ -1079,9 +1330,13 @@ struct virtqueue *vring_create_virtqueue(
>  		return NULL;
>  	}
>  
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +
>  	/* TODO: allocate each queue chunk individually */
> -	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
> +			num /= 2) {
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>  					  &dma_addr,
>  					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
>  		if (queue)
> @@ -1093,17 +1348,21 @@ struct virtqueue *vring_create_virtqueue(
>  
>  	if (!queue) {
>  		/* Try to get a single page. You are my only hope! */
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>  					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
>  	}
>  	if (!queue)
>  		return NULL;
>  
> -	queue_size_in_bytes = vring_size(num, vring_align);
> -	vring_init(&vring, num, queue, vring_align);
> +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
> +	if (packed)
> +		vring_init_packed(&vring.vring_packed, num, queue, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, queue, vring_align);
>  
> -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				   notify, callback, name);
> +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				   context, notify, callback, name);
>  	if (!vq) {
>  		vring_free_queue(vdev, queue_size_in_bytes, queue,
>  				 dma_addr);
> @@ -1129,10 +1388,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>  				      void (*callback)(struct virtqueue *vq),
>  				      const char *name)
>  {
> -	struct vring vring;
> -	vring_init(&vring, num, pages, vring_align);
> -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				     notify, callback, name);
> +	union vring_union vring;
> +	bool packed;
> +
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +	if (packed)
> +		vring_init_packed(&vring.vring_packed, num, pages, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, pages, vring_align);


vring_init in the UAPI header is more or less a bug.
I'd just stop using it, keep it around for legacy userspace.

> +
> +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				     context, notify, callback, name);
>  }
>  EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>  
> @@ -1142,7 +1408,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>  
>  	if (vq->we_own_ring) {
>  		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> -				 vq->vring.desc, vq->queue_dma_addr);
> +				 vq->packed ? (void *)vq->vring_packed.desc :
> +					      (void *)vq->vring.desc,
> +				 vq->queue_dma_addr);
>  	}
>  	list_del(&_vq->list);
>  	kfree(vq);
> @@ -1184,7 +1452,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
>  
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> -	return vq->vring.num;
> +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>  
> @@ -1227,6 +1495,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>  
>  	BUG_ON(!vq->we_own_ring);
>  
> +	if (vq->packed)
> +		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
> +				(char *)vq->vring_packed.desc);
> +
>  	return vq->queue_dma_addr +
>  		((char *)vq->vring.avail - (char *)vq->vring.desc);
>  }
> @@ -1238,11 +1510,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>  
>  	BUG_ON(!vq->we_own_ring);
>  
> +	if (vq->packed)
> +		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
> +				(char *)vq->vring_packed.desc);
> +
>  	return vq->queue_dma_addr +
>  		((char *)vq->vring.used - (char *)vq->vring.desc);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
>  
> +/* Only available for split ring */
>  const struct vring *virtqueue_get_vring(struct virtqueue *vq)
>  {
>  	return &to_vvq(vq)->vring;
> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> index fab02133a919..992142b35f55 100644
> --- a/include/linux/virtio_ring.h
> +++ b/include/linux/virtio_ring.h
> @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
>  struct virtio_device;
>  struct virtqueue;
>  
> +union vring_union {
> +	struct vring vring_split;
> +	struct vring_packed vring_packed;
> +};
> +
>  /*
>   * Creates a virtqueue and allocates the descriptor ring.  If
>   * may_reduce_num is set, then this may allocate a smaller ring than
> @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
>  
>  /* Creates a virtqueue with a custom layout. */
>  struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>  					struct virtio_device *vdev,
>  					bool weak_barriers,
>  					bool ctx,
> -- 
> 2.18.0

^ permalink raw reply

* Re: [PATCH net-next v2 4/5] virtio_ring: add event idx support in packed ring
From: Michael S. Tsirkin @ 2018-09-07 14:10 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: virtio-dev, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180711022711.7090-5-tiwei.bie@intel.com>

On Wed, Jul 11, 2018 at 10:27:10AM +0800, Tiwei Bie wrote:
> This commit introduces the EVENT_IDX support in packed ring.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>

Besides the usual comment about hard-coded constants like <<15:
does this actually do any good for performance? We don't
have to if we do not want to.

> ---
>  drivers/virtio/virtio_ring.c | 73 ++++++++++++++++++++++++++++++++----
>  1 file changed, 65 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index f317b485ba54..f79a1e17f7d1 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1050,7 +1050,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>  static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 flags;
> +	u16 new, old, off_wrap, flags, wrap_counter, event_idx;
>  	bool needs_kick;
>  	u32 snapshot;
>  
> @@ -1059,9 +1059,19 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>  	 * suppressions. */
>  	virtio_mb(vq->weak_barriers);
>  
> +	old = vq->next_avail_idx - vq->num_added;
> +	new = vq->next_avail_idx;
> +	vq->num_added = 0;
> +
>  	snapshot = READ_ONCE(*(u32 *)vq->vring_packed.device);
> +	off_wrap = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot & 0xffff));
>  	flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;


some kind of struct union would be helpful to make this readable.

>  
> +	wrap_counter = off_wrap >> 15;
> +	event_idx = off_wrap & ~(1 << 15);
> +	if (wrap_counter != vq->avail_wrap_counter)
> +		event_idx -= vq->vring_packed.num;
> +
>  #ifdef DEBUG
>  	if (vq->last_add_time_valid) {
>  		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> @@ -1070,7 +1080,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>  	vq->last_add_time_valid = false;
>  #endif
>  
> -	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> +	if (flags == VRING_EVENT_F_DESC)
> +		needs_kick = vring_need_event(event_idx, new, old);
> +	else
> +		needs_kick = (flags != VRING_EVENT_F_DISABLE);
>  	END_USE(vq);
>  	return needs_kick;
>  }
> @@ -1185,6 +1198,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>  	ret = vq->desc_state_packed[id].data;
>  	detach_buf_packed(vq, id, ctx);
>  
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> +		virtio_store_mb(vq->weak_barriers,
> +				&vq->vring_packed.driver->off_wrap,
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> +					((u16)vq->used_wrap_counter << 15)));
> +
>  #ifdef DEBUG
>  	vq->last_add_time_valid = false;
>  #endif
> @@ -1213,8 +1235,18 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>  	/* We optimistically turn back on interrupts, then check if there was
>  	 * more to do. */
>  
> +	if (vq->event) {
> +		vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> +				vq->last_used_idx |
> +				((u16)vq->used_wrap_counter << 15));
> +		/* We need to update event offset and event wrap
> +		 * counter first before updating event flags. */
> +		virtio_wmb(vq->weak_barriers);
> +	}
> +
>  	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> +						     VRING_EVENT_F_ENABLE;
>  		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>  							vq->event_flags_shadow);
>  	}
> @@ -1238,22 +1270,47 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned off_wrap)
>  static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs, used_idx, wrap_counter;
>  
>  	START_USE(vq);
>  
>  	/* We optimistically turn back on interrupts, then check if there was
>  	 * more to do. */
>  
> +	if (vq->event) {
> +		/* TODO: tune this threshold */
> +		bufs = (vq->vring_packed.num - _vq->num_free) * 3 / 4;
> +		wrap_counter = vq->used_wrap_counter;
> +
> +		used_idx = vq->last_used_idx + bufs;
> +		if (used_idx >= vq->vring_packed.num) {
> +			used_idx -= vq->vring_packed.num;
> +			wrap_counter ^= 1;
> +		}
> +
> +		vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> +				used_idx | (wrap_counter << 15));
> +
> +		/* We need to update event offset and event wrap
> +		 * counter first before updating event flags. */
> +		virtio_wmb(vq->weak_barriers);
> +	} else {
> +		used_idx = vq->last_used_idx;
> +		wrap_counter = vq->used_wrap_counter;
> +	}
> +
>  	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> +						     VRING_EVENT_F_ENABLE;
>  		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>  							vq->event_flags_shadow);
> -		/* We need to enable interrupts first before re-checking
> -		 * for more used buffers. */
> -		virtio_mb(vq->weak_barriers);
>  	}
>  
> -	if (more_used_packed(vq)) {
> +	/* We need to update event suppression structure first
> +	 * before re-checking for more used buffers. */
> +	virtio_mb(vq->weak_barriers);
> +

mb is expensive. We should not do it if we changed nothing.

> +	if (is_used_desc_packed(vq, used_idx, wrap_counter)) {
>  		END_USE(vq);
>  		return false;
>  	}
> -- 
> 2.18.0

^ permalink raw reply

* Re: [PATCH net-next 06/11] tuntap: split out XDP logic
From: Michael S. Tsirkin @ 2018-09-07 14:16 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <84ae17c1-f34b-610d-a5a1-ba8dce1732f3@redhat.com>

On Fri, Sep 07, 2018 at 11:29:34AM +0800, Jason Wang wrote:
> > > +		if (act != XDP_PASS)
> > > +			goto out;
> > likely?
> 
> It depends on the XDP program, so I tend not to use it.

Point is XDP_PASS is already slow.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next 04/11] tuntap: simplify error handling in tun_build_skb()
From: Michael S. Tsirkin @ 2018-09-07 14:17 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <e1f17f51-5f99-df18-4185-6c6e4dfaba89@redhat.com>

On Fri, Sep 07, 2018 at 11:22:00AM +0800, Jason Wang wrote:
> > > @@ -1668,6 +1668,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
> > >   	if (copied != len)
> > >   		return ERR_PTR(-EFAULT);
> > > +	get_page(alloc_frag->page);
> > > +	alloc_frag->offset += buflen;
> > > +
> > This adds an atomic op on XDP_DROP which is a data path
> > operation for some workloads.
> 
> Yes, I have patch on top to amortize this, the idea is to have a very big
> refcount once after the frag was allocated and maintain a bias and decrease
> them all when allocating new frags.'

Why bother with refcounting for a drop though? It should be simple.

-- 
MST

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox