All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: xiaohui.xin@intel.com
Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu,
	jdike@c2.user-mode-linux.org, Zhao Yu <yzhao81@gmail.com>
Subject: Re: [PATCH v1 3/3] Let host NIC driver to DMA to guest user space.
Date: Mon, 8 Mar 2010 13:18:26 +0200	[thread overview]
Message-ID: <20100308111826.GH7482@redhat.com> (raw)
In-Reply-To: <1267868318-19268-4-git-send-email-xiaohui.xin@intel.com>

On Sat, Mar 06, 2010 at 05:38:38PM +0800, xiaohui.xin@intel.com wrote:
> From: Xin Xiaohui <xiaohui.xin@intel.com>
> 
> The patch let host NIC driver to receive user space skb,
> then the driver has chance to directly DMA to guest user
> space buffers thru single ethX interface.
> 
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> Signed-off-by: Zhao Yu <yzhao81@gmail.com>
> Sigend-off-by: Jeff Dike <jdike@c2.user-mode-linux.org>


I have a feeling I commented on some of the below issues already.
Do you plan to send a version with comments addressed?

> ---
>  include/linux/netdevice.h |   76 ++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/skbuff.h    |   30 +++++++++++++++--
>  net/core/dev.c            |   32 ++++++++++++++++++
>  net/core/skbuff.c         |   79 +++++++++++++++++++++++++++++++++++++++++----
>  4 files changed, 205 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 94958c1..97bf12c 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -485,6 +485,17 @@ struct netdev_queue {
>  	unsigned long		tx_dropped;
>  } ____cacheline_aligned_in_smp;
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +struct mpassthru_port	{
> +	int		hdr_len;
> +	int		data_len;
> +	int		npages;
> +	unsigned	flags;
> +	struct socket	*sock;
> +	struct skb_user_page	*(*ctor)(struct mpassthru_port *,
> +				struct sk_buff *, int);
> +};
> +#endif
>  
>  /*
>   * This structure defines the management hooks for network devices.
> @@ -636,6 +647,10 @@ struct net_device_ops {
>  	int			(*ndo_fcoe_ddp_done)(struct net_device *dev,
>  						     u16 xid);
>  #endif
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	int			(*ndo_mp_port_prep)(struct net_device *dev,
> +						struct mpassthru_port *port);
> +#endif
>  };
>  
>  /*
> @@ -891,7 +906,8 @@ struct net_device
>  	struct macvlan_port	*macvlan_port;
>  	/* GARP */
>  	struct garp_port	*garp_port;
> -
> +	/* mpassthru */
> +	struct mpassthru_port	*mp_port;
>  	/* class/net/name entry */
>  	struct device		dev;
>  	/* space for optional statistics and wireless sysfs groups */
> @@ -2013,6 +2029,62 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
>  		return 0;
>  	return dev->ethtool_ops->get_flags(dev);
>  }
> -#endif /* __KERNEL__ */
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +static inline int netdev_mp_port_prep(struct net_device *dev,
> +		struct mpassthru_port *port)
> +{

This function lacks documentation.

> +	int rc;
> +	int npages, data_len;
> +	const struct net_device_ops *ops = dev->netdev_ops;
> +
> +	/* needed by packet split */
> +	if (ops->ndo_mp_port_prep) {
> +		rc = ops->ndo_mp_port_prep(dev, port);
> +		if (rc)
> +			return rc;
> +	} else {  /* should be temp */
> +		port->hdr_len = 128;
> +		port->data_len = 2048;
> +		port->npages = 1;

where do the numbers come from?

> +	}
> +
> +	if (port->hdr_len <= 0)
> +		goto err;
> +
> +	npages = port->npages;
> +	data_len = port->data_len;
> +	if (npages <= 0 || npages > MAX_SKB_FRAGS ||
> +			(data_len < PAGE_SIZE * (npages - 1) ||
> +			 data_len > PAGE_SIZE * npages))
> +		goto err;
> +
> +	return 0;
> +err:
> +	dev_warn(&dev->dev, "invalid page constructor parameters\n");
> +
> +	return -EINVAL;
> +}
> +
> +static inline int netdev_mp_port_attach(struct net_device *dev,
> +		struct mpassthru_port *port)
> +{
> +	if (rcu_dereference(dev->mp_port))
> +		return -EBUSY;
> +
> +	rcu_assign_pointer(dev->mp_port, port);
> +
> +	return 0;
> +}
> +
> +static inline void netdev_mp_port_detach(struct net_device *dev)
> +{
> +	if (!rcu_dereference(dev->mp_port))
> +		return;
> +
> +	rcu_assign_pointer(dev->mp_port, NULL);
> +	synchronize_rcu();
> +}

The above looks wrong, rcu_dereference should be called
under rcu read side, rcu_assign_pointer usually should not,
synchronize_rcu definitely should not.

As I suggested already, these functions are better opencoded,
rcu is tricky as is without hiding it in inline helpers.

> +#endif /* CONFIG_VHOST_PASSTHRU */
> +#endif /* __KERNEL__ */
>  #endif	/* _LINUX_NETDEVICE_H */
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index df7b23a..e59fa57 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -209,6 +209,13 @@ struct skb_shared_info {
>  	void *		destructor_arg;
>  };
>  
> +struct skb_user_page {
> +	u8              *start;
> +	int             size;
> +	struct skb_frag_struct *frags;
> +	struct skb_shared_info *ushinfo;
> +	void		(*dtor)(struct skb_user_page *);
> +};
>  /* We divide dataref into two halves.  The higher 16 bits hold references
>   * to the payload part of skb->data.  The lower 16 bits hold references to
>   * the entire skb->data.  A clone of a headerless skb holds the length of
> @@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
>  extern void consume_skb(struct sk_buff *skb);
>  extern void	       __kfree_skb(struct sk_buff *skb);
>  extern struct sk_buff *__alloc_skb(unsigned int size,
> -				   gfp_t priority, int fclone, int node);
> +				   gfp_t priority, int fclone,
> +				   int node, struct net_device *dev);
>  static inline struct sk_buff *alloc_skb(unsigned int size,
>  					gfp_t priority)
>  {
> -	return __alloc_skb(size, priority, 0, -1);
> +	return __alloc_skb(size, priority, 0, -1, NULL);
>  }
>  
>  static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
>  					       gfp_t priority)
>  {
> -	return __alloc_skb(size, priority, 1, -1);
> +	return __alloc_skb(size, priority, 1, -1, NULL);
>  }
>  
>  extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
> @@ -1509,6 +1517,22 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
>  	__free_page(page);
>  }
>  
> +extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
> +			struct sk_buff *skb, int npages);
> +
> +static inline struct skb_user_page *netdev_alloc_user_page(
> +		struct net_device *dev,
> +		struct sk_buff *skb, unsigned int size)
> +{
> +	struct skb_user_page *user;
> +	int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);

Should round up to full pages?

> +
> +	user = netdev_alloc_user_pages(dev, skb, npages);
> +	if (likely(user))
> +		return user;
> +	return NULL;
> +}
> +
>  /**
>   *	skb_clone_writable - is the header of a clone writable
>   *	@skb: buffer to check
> diff --git a/net/core/dev.c b/net/core/dev.c
> index b8f74cf..ab8b082 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2265,6 +2265,30 @@ void netif_nit_deliver(struct sk_buff *skb)
>  	rcu_read_unlock();
>  }
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
> +					struct packet_type **pt_prev,
> +					int *ret, struct net_device *orig_dev)

please document

pt_prev, orig_dev and ret seem unused?

> +{
> +	struct mpassthru_port *ctor = NULL;

Why do you call the port "ctor"?

> +	struct sock *sk = NULL;
> +
> +	if (skb->dev)
> +		ctor = skb->dev->mp_port;
> +	if (!ctor)
> +		return skb;
> +
> +	sk = ctor->sock->sk;
> +
> +	skb_queue_tail(&sk->sk_receive_queue, skb);
> +
> +	sk->sk_data_ready(sk, skb->len);
> +	return NULL;
> +}
> +#else
> +#define handle_mpassthru(skb, pt_prev, ret, orig_dev)      (skb)
> +#endif
> +
>  /**
>   *	netif_receive_skb - process receive buffer from network
>   *	@skb: buffer to process
> @@ -2342,6 +2366,9 @@ int netif_receive_skb(struct sk_buff *skb)
>  		goto out;
>  ncls:
>  #endif
> +	skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
> +	if (!skb)
> +		goto out;
>  
>  	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
>  	if (!skb)
> @@ -2455,6 +2482,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
>  	if (skb_is_gso(skb) || skb_has_frags(skb))
>  		goto normal;
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	if (skb->dev && skb->dev->mp_port)
> +		goto normal;
> +#endif
> +
>  	rcu_read_lock();
>  	list_for_each_entry_rcu(ptype, head, list) {
>  		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 80a9616..6510e5b 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -170,13 +170,15 @@ EXPORT_SYMBOL(skb_under_panic);
>   *	%GFP_ATOMIC.
>   */
>  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> -			    int fclone, int node)
> +			    int fclone, int node, struct net_device *dev)
>  {
>  	struct kmem_cache *cache;
>  	struct skb_shared_info *shinfo;
>  	struct sk_buff *skb;
>  	u8 *data;
> -
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	struct skb_user_page *user = NULL;
> +#endif
>  	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>  
>  	/* Get the HEAD */
> @@ -185,8 +187,26 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
>  		goto out;
>  
>  	size = SKB_DATA_ALIGN(size);
> -	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
> -			gfp_mask, node);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	if (!dev || !dev->mp_port) { /* Legacy alloc func */
> +#endif
> +		data = kmalloc_node_track_caller(
> +				size + sizeof(struct skb_shared_info),
> +				gfp_mask, node);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	} else { /* Allocation may from page constructor of device */

what does the comment mean?

> +		user = netdev_alloc_user_page(dev, skb, size);
> +		if (!user) {
> +			data = kmalloc_node_track_caller(
> +				size + sizeof(struct skb_shared_info),
> +				gfp_mask, node);
> +			printk(KERN_INFO "can't alloc user buffer.\n");
> +		} else {
> +			data = user->start;
> +			size = SKB_DATA_ALIGN(user->size);
> +		}
> +	}
> +#endif
>  	if (!data)
>  		goto nodata;
>  
> @@ -208,6 +228,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
>  	skb->mac_header = ~0U;
>  #endif
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	if (user)
> +		memcpy(user->ushinfo, skb_shinfo(skb),
> +				sizeof(struct skb_shared_info));
> +#endif
>  	/* make sure we initialize shinfo sequentially */
>  	shinfo = skb_shinfo(skb);
>  	atomic_set(&shinfo->dataref, 1);
> @@ -231,6 +256,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
>  
>  		child->fclone = SKB_FCLONE_UNAVAILABLE;
>  	}
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	shinfo->destructor_arg = user;
> +#endif
> +
>  out:
>  	return skb;
>  nodata:
> @@ -259,7 +288,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
>  	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
>  	struct sk_buff *skb;
>  
> -	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
> +	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
>  	if (likely(skb)) {
>  		skb_reserve(skb, NET_SKB_PAD);
>  		skb->dev = dev;
> @@ -278,6 +307,29 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL(__netdev_alloc_page);
>  
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
> +			struct sk_buff *skb, int npages)
> +{
> +	struct mpassthru_port *ctor;
> +	struct skb_user_page *user = NULL;
> +
> +	rcu_read_lock();
> +	ctor = rcu_dereference(dev->mp_port);
> +	if (!ctor)
> +		goto out;
> +
> +	BUG_ON(npages > ctor->npages);
> +
> +	user = ctor->ctor(ctor, skb, npages);

With the assumption that "ctor" pins userspace pages,
can't it sleep? If yes you can't call it under rcu read side
critical section.

> +out:
> +	rcu_read_unlock();
> +
> +	return user;
> +}
> +EXPORT_SYMBOL(netdev_alloc_user_pages);
> +#endif
> +
>  void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
>  		int size)
>  {
> @@ -338,6 +390,10 @@ static void skb_clone_fraglist(struct sk_buff *skb)
>  
>  static void skb_release_data(struct sk_buff *skb)
>  {
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
> +#endif
> +
>  	if (!skb->cloned ||
>  	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
>  			       &skb_shinfo(skb)->dataref)) {
> @@ -349,7 +405,10 @@ static void skb_release_data(struct sk_buff *skb)
>  
>  		if (skb_has_frags(skb))
>  			skb_drop_fraglist(skb);
> -
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +		if (skb->dev && skb->dev->mp_port && user && user->dtor)
> +			user->dtor(user);
> +#endif
>  		kfree(skb->head);
>  	}
>  }
> @@ -503,8 +562,14 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
>  	if (skb_shared(skb) || skb_cloned(skb))
>  		return 0;
>  
> -	skb_release_head_state(skb);
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +	if (skb->dev && skb->dev->mp_port)
> +		return 0;
> +#endif
> +
>  	shinfo = skb_shinfo(skb);
> +
> +	skb_release_head_state(skb);
>  	atomic_set(&shinfo->dataref, 1);
>  	shinfo->nr_frags = 0;
>  	shinfo->gso_size = 0;
> -- 
> 1.5.4.4

  parent reply	other threads:[~2010-03-08 11:22 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-03-06  9:38 [PATCH v1 0/3] Provide a zero-copy method on KVM virtio-net xiaohui.xin
2010-03-06  9:38 ` [PATCH v1 1/3] A device for zero-copy based " xiaohui.xin
2010-03-06  9:38   ` [PATCH v1 2/3] Provides multiple submits and asynchronous notifications xiaohui.xin
2010-03-06  9:38     ` [PATCH v1 3/3] Let host NIC driver to DMA to guest user space xiaohui.xin
2010-03-06 17:18       ` Stephen Hemminger
2010-03-08 11:18       ` Michael S. Tsirkin [this message]
2010-03-07 11:18     ` [PATCH v1 2/3] Provides multiple submits and asynchronous notifications Michael S. Tsirkin
2010-03-15  8:46       ` Xin, Xiaohui
2010-03-15  9:23         ` Michael S. Tsirkin
2010-03-16  9:32           ` Xin Xiaohui
2010-03-16 11:33             ` [PATCH " Michael S. Tsirkin
2010-03-17  9:48               ` Xin, Xiaohui
2010-03-17 10:27                 ` Michael S. Tsirkin
2010-04-01  9:14                   ` Xin Xiaohui
2010-04-01 11:02                     ` [PATCH " Michael S. Tsirkin
2010-04-02  2:16                       ` Xin, Xiaohui
2010-04-04 11:40                         ` Michael S. Tsirkin
2010-04-06  5:46                           ` Xin, Xiaohui
2010-04-06  7:51                             ` Michael S. Tsirkin
2010-04-07  1:36                               ` Xin, Xiaohui
2010-04-07  8:18                                 ` Michael S. Tsirkin
2010-04-08  9:07                                   ` xiaohui.xin
2010-03-08 11:28   ` [PATCH v1 1/3] A device for zero-copy based on KVM virtio-net Michael S. Tsirkin
2010-04-01  9:27     ` Xin Xiaohui
2010-04-01 11:08       ` [PATCH " Michael S. Tsirkin
2010-04-06  5:41         ` Xin, Xiaohui
2010-04-06  7:49           ` Michael S. Tsirkin
2010-04-07  2:41         ` Xin, Xiaohui
2010-04-07  8:15           ` Michael S. Tsirkin
2010-04-07  9:00             ` xiaohui.xin
2010-04-07 11:17               ` [PATCH " Michael S. Tsirkin
2010-03-07 10:50 ` [PATCH v1 0/3] Provide a zero-copy method " Michael S. Tsirkin
2010-03-09  7:47   ` Xin, Xiaohui

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100308111826.GH7482@redhat.com \
    --to=mst@redhat.com \
    --cc=jdike@c2.user-mode-linux.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=netdev@vger.kernel.org \
    --cc=xiaohui.xin@intel.com \
    --cc=yzhao81@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.