Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC] [ver3 PATCH 6/6] virtio_net: Convert virtio_net driver to use find_vqs_irq
From: Krishna Kumar @ 2011-11-11 13:07 UTC (permalink / raw)
  To: rusty, mst; +Cc: netdev, kvm, davem, Krishna Kumar, virtualization
In-Reply-To: <20111111130223.9878.59517.sendpatchset@krkumar2.in.ibm.com>

Convert the virtio_net driver to use find_vqs_irq(). The TX vqs
share a single irq, while the RX vqs have individual irqs.
The skb_xmit_done handler also checks whether any work is required.

Signed-off-by: krkumar2@in.ibm.com
---
 drivers/net/virtio_net.c |   29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c	2011-11-11 16:45:17.000000000 +0530
+++ new/drivers/net/virtio_net.c	2011-11-11 16:48:45.000000000 +0530
@@ -163,11 +163,13 @@ static void skb_xmit_done(struct virtque
 	struct virtnet_info *vi = vq->vdev->priv;
 	int qnum = vq->queue_index / 2; /* RX/TX vqs are allocated in pairs */
 
-	/* Suppress further interrupts. */
-	virtqueue_disable_cb(vq);
+	if (__netif_subqueue_stopped(vi->dev, qnum)) {
+		/* Suppress further interrupts. */
+		virtqueue_disable_cb(vq);
 
-	/* We were probably waiting for more output buffers. */
-	netif_wake_subqueue(vi->dev, qnum);
+		/* We were probably waiting for more output buffers. */
+		netif_wake_subqueue(vi->dev, qnum);
+	}
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -1120,6 +1122,7 @@ static void setup_cvq(struct virtnet_inf
 
 static int invoke_find_vqs(struct virtnet_info *vi)
 {
+	unsigned long *flags = NULL;
 	vq_callback_t **callbacks;
 	struct virtqueue **vqs;
 	int ret = -ENOMEM;
@@ -1141,6 +1144,14 @@ static int invoke_find_vqs(struct virtne
 	if (!vqs || !callbacks || !names)
 		goto err;
 
+	if (vi->num_queue_pairs > 1) {
+		int num = (total_vqs + BITS_PER_LONG - 1) / BITS_PER_LONG;
+
+		flags = kzalloc(num * sizeof(*flags), GFP_KERNEL);
+		if (!flags)
+			goto err;
+	}
+
 	/* Allocate/initialize parameters for recv virtqueues */
 	for (i = 0; i < vi->num_queue_pairs * 2; i += 2) {
 		callbacks[i] = skb_recv_done;
@@ -1155,6 +1166,8 @@ static int invoke_find_vqs(struct virtne
 		names[i] = kasprintf(GFP_KERNEL, "output.%d", i / 2);
 		if (!names[i])
 			goto err;
+		if (flags)
+			set_bit(i, flags);
 	}
 
 	/* Parameters for control virtqueue, if any */
@@ -1163,9 +1176,9 @@ static int invoke_find_vqs(struct virtne
 		names[i - 1] = "control";
 	}
 
-	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
-					 (const char **)names);
-
+	ret = vi->vdev->config->find_vqs_irq(vi->vdev, total_vqs, vqs,
+					     callbacks, (const char **)names,
+					     flags);
 	if (ret)
 		goto err;
 
@@ -1174,6 +1187,8 @@ static int invoke_find_vqs(struct virtne
 	setup_cvq(vi, vqs, vi->num_queue_pairs * 2);
 
 err:
+	kfree(flags);
+
 	if (ret && names)
 		for (i = 0; i < vi->num_queue_pairs * 2; i++)
 			kfree(names[i]);

^ permalink raw reply

* Re: [PATCH 4/4] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
From: Ian Campbell @ 2011-11-11 13:20 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, David S. Miller,
	Neil Brown, J. Bruce Fields,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20111111123824.GA23902-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Fri, 2011-11-11 at 12:38 +0000, Michael S. Tsirkin wrote:
> On Wed, Nov 09, 2011 at 03:02:07PM +0000, Ian Campbell wrote:
> > This prevents an issue where an ACK is delayed, a retransmit is queued (either
> > at the RPC or TCP level) and the ACK arrives before the retransmission hits the
> > wire. If this happens to an NFS WRITE RPC then the write() system call
> > completes and the userspace process can continue, potentially modifying data
> > referenced by the retransmission before the retransmission occurs.
> > 
> > Signed-off-by: Ian Campbell <ian.campbell-Sxgqhf6Nn4DQT0dZR+AlfA@public.gmane.org>
> > Acked-by: Trond Myklebust <Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
> > Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> > Cc: Neil Brown <neilb-l3A5Bk7waGM@public.gmane.org>
> > Cc: "J. Bruce Fields" <bfields-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
> > Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> 
> So this blocks the system call until all page references
> are gone, right?

Right. The alternative is to return to userspace while the network stack
still has a reference to the buffer which was passed in -- that's the
exact class of problem this patch is supposed to fix.

> But, there's no upper limit on how long the
> page is referenced, correct?

Correct.

>  Consider a bridged setup
> with an skb queued at a tap device - can this cause one process
> to block another one by virtue of not consuming a cloned skb?

Hmm, yes.

One approach might be to introduce the concept of an skb timeout to the
stack as a whole and cancel (or deep copy) after that timeout occurs.
That's going to be tricky though I suspect...

A simpler option would be to have endpoints such as a tap device,
which can swallow skbs for arbitrary lengths of time, implement a policy
in this regard: either deep copy or drop after a timeout?

Ian.

> 
> > ---
> >  include/linux/sunrpc/xdr.h  |    2 ++
> >  include/linux/sunrpc/xprt.h |    5 ++++-
> >  net/sunrpc/clnt.c           |   27 ++++++++++++++++++++++-----
> >  net/sunrpc/svcsock.c        |    3 ++-
> >  net/sunrpc/xprt.c           |   13 +++++++++++++
> >  net/sunrpc/xprtsock.c       |    3 ++-
> >  6 files changed, 45 insertions(+), 8 deletions(-)
> > 
> > diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
> > index a20970e..172f81e 100644
> > --- a/include/linux/sunrpc/xdr.h
> > +++ b/include/linux/sunrpc/xdr.h
> > @@ -16,6 +16,7 @@
> >  #include <asm/byteorder.h>
> >  #include <asm/unaligned.h>
> >  #include <linux/scatterlist.h>
> > +#include <linux/skbuff.h>
> >  
> >  /*
> >   * Buffer adjustment
> > @@ -57,6 +58,7 @@ struct xdr_buf {
> >  			tail[1];	/* Appended after page data */
> >  
> >  	struct page **	pages;		/* Array of contiguous pages */
> > +	struct skb_frag_destructor *destructor;
> >  	unsigned int	page_base,	/* Start of page data */
> >  			page_len,	/* Length of page data */
> >  			flags;		/* Flags for data disposition */
> > diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> > index 15518a1..75131eb 100644
> > --- a/include/linux/sunrpc/xprt.h
> > +++ b/include/linux/sunrpc/xprt.h
> > @@ -92,7 +92,10 @@ struct rpc_rqst {
> >  						/* A cookie used to track the
> >  						   state of the transport
> >  						   connection */
> > -	
> > +	struct skb_frag_destructor destructor;	/* SKB paged fragment
> > +						 * destructor for
> > +						 * transmitted pages*/
> > +
> >  	/*
> >  	 * Partial send handling
> >  	 */
> > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> > index c5347d2..919538d 100644
> > --- a/net/sunrpc/clnt.c
> > +++ b/net/sunrpc/clnt.c
> > @@ -61,6 +61,7 @@ static void	call_reserve(struct rpc_task *task);
> >  static void	call_reserveresult(struct rpc_task *task);
> >  static void	call_allocate(struct rpc_task *task);
> >  static void	call_decode(struct rpc_task *task);
> > +static void	call_complete(struct rpc_task *task);
> >  static void	call_bind(struct rpc_task *task);
> >  static void	call_bind_status(struct rpc_task *task);
> >  static void	call_transmit(struct rpc_task *task);
> > @@ -1113,6 +1114,8 @@ rpc_xdr_encode(struct rpc_task *task)
> >  			 (char *)req->rq_buffer + req->rq_callsize,
> >  			 req->rq_rcvsize);
> >  
> > +	req->rq_snd_buf.destructor = &req->destructor;
> > +
> >  	p = rpc_encode_header(task);
> >  	if (p == NULL) {
> >  		printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
> > @@ -1276,6 +1279,7 @@ call_connect_status(struct rpc_task *task)
> >  static void
> >  call_transmit(struct rpc_task *task)
> >  {
> > +	struct rpc_rqst *req = task->tk_rqstp;
> >  	dprint_status(task);
> >  
> >  	task->tk_action = call_status;
> > @@ -1309,8 +1313,8 @@ call_transmit(struct rpc_task *task)
> >  	call_transmit_status(task);
> >  	if (rpc_reply_expected(task))
> >  		return;
> > -	task->tk_action = rpc_exit_task;
> > -	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
> > +	task->tk_action = call_complete;
> > +	skb_frag_destructor_unref(&req->destructor);
> >  }
> >  
> >  /*
> > @@ -1383,7 +1387,8 @@ call_bc_transmit(struct rpc_task *task)
> >  		return;
> >  	}
> >  
> > -	task->tk_action = rpc_exit_task;
> > +	task->tk_action = call_complete;
> > +	skb_frag_destructor_unref(&req->destructor);
> >  	if (task->tk_status < 0) {
> >  		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
> >  			"error: %d\n", task->tk_status);
> > @@ -1423,7 +1428,6 @@ call_bc_transmit(struct rpc_task *task)
> >  			"error: %d\n", task->tk_status);
> >  		break;
> >  	}
> > -	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
> >  }
> >  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
> >  
> > @@ -1589,12 +1593,14 @@ call_decode(struct rpc_task *task)
> >  		return;
> >  	}
> >  
> > -	task->tk_action = rpc_exit_task;
> > +	task->tk_action = call_complete;
> >  
> >  	if (decode) {
> >  		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
> >  						      task->tk_msg.rpc_resp);
> >  	}
> > +	rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
> > +	skb_frag_destructor_unref(&req->destructor);
> >  	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
> >  			task->tk_status);
> >  	return;
> > @@ -1609,6 +1615,17 @@ out_retry:
> >  	}
> >  }
> >  
> > +/*
> > + * 8.	Wait for pages to be released by the network stack.
> > + */
> > +static void
> > +call_complete(struct rpc_task *task)
> > +{
> > +	dprintk("RPC: %5u call_complete result %d\n",
> > +		task->tk_pid, task->tk_status);
> > +	task->tk_action = rpc_exit_task;
> > +}
> > +
> >  static __be32 *
> >  rpc_encode_header(struct rpc_task *task)
> >  {
> > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> > index 852a258..3685cad 100644
> > --- a/net/sunrpc/svcsock.c
> > +++ b/net/sunrpc/svcsock.c
> > @@ -196,7 +196,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
> >  	while (pglen > 0) {
> >  		if (slen == size)
> >  			flags = 0;
> > -		result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
> > +		result = kernel_sendpage(sock, *ppage, xdr->destructor,
> > +					 base, size, flags);
> >  		if (result > 0)
> >  			len += result;
> >  		if (result != size)
> > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> > index f4385e4..925aa0c 100644
> > --- a/net/sunrpc/xprt.c
> > +++ b/net/sunrpc/xprt.c
> > @@ -1103,6 +1103,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
> >  	xprt->xid = net_random();
> >  }
> >  
> > +static int xprt_complete_skb_pages(void *calldata)
> > +{
> > +	struct rpc_task *task = calldata;
> > +	struct rpc_rqst	*req = task->tk_rqstp;
> > +
> > +	dprintk("RPC: %5u completing skb pages\n", task->tk_pid);
> > +	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
> > +	return 0;
> > +}
> > +
> >  static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> >  {
> >  	struct rpc_rqst	*req = task->tk_rqstp;
> > @@ -1115,6 +1125,9 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
> >  	req->rq_xid     = xprt_alloc_xid(xprt);
> >  	req->rq_release_snd_buf = NULL;
> >  	xprt_reset_majortimeo(req);
> > +	atomic_set(&req->destructor.ref, 1);
> > +	req->destructor.destroy = &xprt_complete_skb_pages;
> > +	req->destructor.data = task;
> >  	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
> >  			req, ntohl(req->rq_xid));
> >  }
> > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > index f79e40e9..af3a106 100644
> > --- a/net/sunrpc/xprtsock.c
> > +++ b/net/sunrpc/xprtsock.c
> > @@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> >  		remainder -= len;
> >  		if (remainder != 0 || more)
> >  			flags |= MSG_MORE;
> > -		err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
> > +		err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
> > +					  base, len, flags);
> >  		if (remainder == 0 || err != len)
> >  			break;
> >  		sent += err;
> > -- 
> > 1.7.2.5
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes
From: Krishna Kumar @ 2011-11-11 13:04 UTC (permalink / raw)
  To: rusty, mst; +Cc: netdev, kvm, davem, Krishna Kumar, virtualization
In-Reply-To: <20111111130223.9878.59517.sendpatchset@krkumar2.in.ibm.com>

Changes for multiqueue virtio_net driver.

Signed-off-by: krkumar2@in.ibm.com
---
 drivers/net/virtio_net.c   |  688 ++++++++++++++++++++++++-----------
 include/linux/virtio_net.h |    2 
 2 files changed, 481 insertions(+), 209 deletions(-)

diff -ruNp org/drivers/net/virtio_net.c new/drivers/net/virtio_net.c
--- org/drivers/net/virtio_net.c	2011-11-11 16:44:38.000000000 +0530
+++ new/drivers/net/virtio_net.c	2011-11-11 16:44:59.000000000 +0530
@@ -40,33 +40,42 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX    2
 
-struct virtnet_stats {
+struct virtnet_send_stats {
 	struct u64_stats_sync syncp;
 	u64 tx_bytes;
 	u64 tx_packets;
+};
 
+struct virtnet_recv_stats {
+	struct u64_stats_sync syncp;
 	u64 rx_bytes;
 	u64 rx_packets;
 };
 
-struct virtnet_info {
-	struct virtio_device *vdev;
-	struct virtqueue *rvq, *svq, *cvq;
-	struct net_device *dev;
-	struct napi_struct napi;
-	unsigned int status;
+/* Internal representation of a send virtqueue */
+struct send_queue {
+	/* Virtqueue associated with this send_queue */
+	struct virtqueue *vq;
 
-	/* Number of input buffers, and max we've ever had. */
-	unsigned int num, max;
+	/* TX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
-	/* I like... big packets and I cannot lie! */
-	bool big_packets;
+	/* Active tx statistics */
+	struct virtnet_send_stats __percpu *stats;
+};
 
-	/* Host will merge rx buffers for big packets (shake it! shake it!) */
-	bool mergeable_rx_bufs;
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+	/* Virtqueue associated with this receive_queue */
+	struct virtqueue *vq;
+
+	/* Back pointer to the virtnet_info */
+	struct virtnet_info *vi;
 
-	/* Active statistics */
-	struct virtnet_stats __percpu *stats;
+	struct napi_struct napi;
+
+	/* Number of input buffers, and max we've ever had. */
+	unsigned int num, max;
 
 	/* Work struct for refilling if we run low on memory. */
 	struct delayed_work refill;
@@ -74,9 +83,29 @@ struct virtnet_info {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
-	/* fragments + linear part + virtio header */
-	struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-	struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
+	/* RX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+
+	/* Active rx statistics */
+	struct virtnet_recv_stats __percpu *stats;
+};
+
+struct virtnet_info {
+	int num_queue_pairs;		/* # of RX/TX vq pairs */
+
+	struct send_queue **sq;
+	struct receive_queue **rq;
+	struct virtqueue *cvq;
+
+	struct virtio_device *vdev;
+	struct net_device *dev;
+	unsigned int status;
+
+	/* I like... big packets and I cannot lie! */
+	bool big_packets;
+
+	/* Host will merge rx buffers for big packets (shake it! shake it!) */
+	bool mergeable_rx_bufs;
 };
 
 struct skb_vnet_hdr {
@@ -106,22 +135,22 @@ static inline struct skb_vnet_hdr *skb_v
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
 	struct page *end;
 
 	/* Find end of list, sew whole thing into vi->pages. */
 	for (end = page; end->private; end = (struct page *)end->private);
-	end->private = (unsigned long)vi->pages;
-	vi->pages = page;
+	end->private = (unsigned long)rq->pages;
+	rq->pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-	struct page *p = vi->pages;
+	struct page *p = rq->pages;
 
 	if (p) {
-		vi->pages = (struct page *)p->private;
+		rq->pages = (struct page *)p->private;
 		/* clear private here, it is used to chain pages */
 		p->private = 0;
 	} else
@@ -129,15 +158,16 @@ static struct page *get_a_page(struct vi
 	return p;
 }
 
-static void skb_xmit_done(struct virtqueue *svq)
+static void skb_xmit_done(struct virtqueue *vq)
 {
-	struct virtnet_info *vi = svq->vdev->priv;
+	struct virtnet_info *vi = vq->vdev->priv;
+	int qnum = vq->queue_index / 2; /* RX/TX vqs are allocated in pairs */
 
 	/* Suppress further interrupts. */
-	virtqueue_disable_cb(svq);
+	virtqueue_disable_cb(vq);
 
 	/* We were probably waiting for more output buffers. */
-	netif_wake_queue(vi->dev);
+	netif_wake_subqueue(vi->dev, qnum);
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -155,9 +185,10 @@ static void set_skb_frag(struct sk_buff 
 	*len -= size;
 }
 
-static struct sk_buff *page_to_skb(struct virtnet_info *vi,
+static struct sk_buff *page_to_skb(struct receive_queue *rq,
 				   struct page *page, unsigned int len)
 {
+	struct virtnet_info *vi = rq->vi;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	unsigned int copy, hdr_len, offset;
@@ -213,12 +244,12 @@ static struct sk_buff *page_to_skb(struc
 	}
 
 	if (page)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return skb;
 }
 
-static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	struct page *page;
@@ -232,7 +263,7 @@ static int receive_mergeable(struct virt
 			skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		page = virtqueue_get_buf(vi->rvq, &len);
+		page = virtqueue_get_buf(rq->vq, &len);
 		if (!page) {
 			pr_debug("%s: rx error: %d buffers missing\n",
 				 skb->dev->name, hdr->mhdr.num_buffers);
@@ -245,15 +276,16 @@ static int receive_mergeable(struct virt
 
 		set_skb_frag(skb, page, 0, &len);
 
-		--vi->num;
+		--rq->num;
 	}
 	return 0;
 }
 
-static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
+static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 {
+	struct net_device *dev = rq->vi->dev;
 	struct virtnet_info *vi = netdev_priv(dev);
-	struct virtnet_stats __percpu *stats = this_cpu_ptr(vi->stats);
+	struct virtnet_recv_stats __percpu *stats = this_cpu_ptr(rq->stats);
 	struct sk_buff *skb;
 	struct page *page;
 	struct skb_vnet_hdr *hdr;
@@ -262,7 +294,7 @@ static void receive_buf(struct net_devic
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
 		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
+			give_pages(rq, buf);
 		else
 			dev_kfree_skb(buf);
 		return;
@@ -274,14 +306,14 @@ static void receive_buf(struct net_devic
 		skb_trim(skb, len);
 	} else {
 		page = buf;
-		skb = page_to_skb(vi, page, len);
+		skb = page_to_skb(rq, page, len);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
-			give_pages(vi, page);
+			give_pages(rq, page);
 			return;
 		}
 		if (vi->mergeable_rx_bufs)
-			if (receive_mergeable(vi, skb)) {
+			if (receive_mergeable(rq, skb)) {
 				dev_kfree_skb(skb);
 				return;
 			}
@@ -351,184 +383,200 @@ frame_err:
 	dev_kfree_skb(skb);
 }
 
-static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
 {
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	int err;
 
-	skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);
+	skb = netdev_alloc_skb_ip_align(rq->vi->dev, MAX_PACKET_LEN);
 	if (unlikely(!skb))
 		return -ENOMEM;
 
 	skb_put(skb, MAX_PACKET_LEN);
 
 	hdr = skb_vnet_hdr(skb);
-	sg_set_buf(vi->rx_sg, &hdr->hdr, sizeof hdr->hdr);
+	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
 
-	skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len);
+	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
 
-	err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 2, skb, gfp);
+	err = virtqueue_add_buf_gfp(rq->vq, rq->sg, 0, 2, skb, gfp);
 	if (err < 0)
 		dev_kfree_skb(skb);
 
 	return err;
 }
 
-static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *first, *list = NULL;
 	char *p;
 	int i, err, offset;
 
-	/* page in vi->rx_sg[MAX_SKB_FRAGS + 1] is list tail */
+	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
 	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
-		first = get_a_page(vi, gfp);
+		first = get_a_page(rq, gfp);
 		if (!first) {
 			if (list)
-				give_pages(vi, list);
+				give_pages(rq, list);
 			return -ENOMEM;
 		}
-		sg_set_buf(&vi->rx_sg[i], page_address(first), PAGE_SIZE);
+		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
 
 		/* chain new page in list head to match sg */
 		first->private = (unsigned long)list;
 		list = first;
 	}
 
-	first = get_a_page(vi, gfp);
+	first = get_a_page(rq, gfp);
 	if (!first) {
-		give_pages(vi, list);
+		give_pages(rq, list);
 		return -ENOMEM;
 	}
 	p = page_address(first);
 
-	/* vi->rx_sg[0], vi->rx_sg[1] share the same page */
-	/* a separated vi->rx_sg[0] for virtio_net_hdr only due to QEMU bug */
-	sg_set_buf(&vi->rx_sg[0], p, sizeof(struct virtio_net_hdr));
+	/* rq->sg[0], rq->sg[1] share the same page */
+	/* a separated rq->sg[0] for virtio_net_hdr only due to QEMU bug */
+	sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));
 
-	/* vi->rx_sg[1] for data packet, from offset */
+	/* rq->sg[1] for data packet, from offset */
 	offset = sizeof(struct padded_vnet_hdr);
-	sg_set_buf(&vi->rx_sg[1], p + offset, PAGE_SIZE - offset);
+	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
 
 	/* chain first in list head */
 	first->private = (unsigned long)list;
-	err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
+	err = virtqueue_add_buf_gfp(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
 				    first, gfp);
 	if (err < 0)
-		give_pages(vi, first);
+		give_pages(rq, first);
 
 	return err;
 }
 
-static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *page;
 	int err;
 
-	page = get_a_page(vi, gfp);
+	page = get_a_page(rq, gfp);
 	if (!page)
 		return -ENOMEM;
 
-	sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE);
+	sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
 
-	err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 1, page, gfp);
+	err = virtqueue_add_buf_gfp(rq->vq, rq->sg, 0, 1, page, gfp);
 	if (err < 0)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return err;
 }
 
 /* Returns false if we couldn't fill entirely (OOM). */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 {
+	struct virtnet_info *vi = rq->vi;
 	int err;
 	bool oom;
 
 	do {
 		if (vi->mergeable_rx_bufs)
-			err = add_recvbuf_mergeable(vi, gfp);
+			err = add_recvbuf_mergeable(rq, gfp);
 		else if (vi->big_packets)
-			err = add_recvbuf_big(vi, gfp);
+			err = add_recvbuf_big(rq, gfp);
 		else
-			err = add_recvbuf_small(vi, gfp);
+			err = add_recvbuf_small(rq, gfp);
 
 		oom = err == -ENOMEM;
 		if (err < 0)
 			break;
-		++vi->num;
+		++rq->num;
 	} while (err > 0);
-	if (unlikely(vi->num > vi->max))
-		vi->max = vi->num;
-	virtqueue_kick(vi->rvq);
+	if (unlikely(rq->num > rq->max))
+		rq->max = rq->num;
+	virtqueue_kick(rq->vq);
 	return !oom;
 }
 
-static void skb_recv_done(struct virtqueue *rvq)
+static void skb_recv_done(struct virtqueue *vq)
 {
-	struct virtnet_info *vi = rvq->vdev->priv;
+	int qnum = vq->queue_index / 2; /* RX/TX vqs are allocated in pairs */
+	struct virtnet_info *vi = vq->vdev->priv;
+	struct napi_struct *napi = &vi->rq[qnum]->napi;
+
 	/* Schedule NAPI, Suppress further interrupts if successful. */
-	if (napi_schedule_prep(&vi->napi)) {
-		virtqueue_disable_cb(rvq);
-		__napi_schedule(&vi->napi);
+	if (napi_schedule_prep(napi)) {
+		virtqueue_disable_cb(vq);
+		__napi_schedule(napi);
 	}
 }
 
-static void virtnet_napi_enable(struct virtnet_info *vi)
+static void virtnet_napi_enable(struct receive_queue *rq)
 {
-	napi_enable(&vi->napi);
+	napi_enable(&rq->napi);
 
 	/* If all buffers were filled by other side before we napi_enabled, we
 	 * won't get another interrupt, so process any outstanding packets
 	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
 	 * We synchronize against interrupts via NAPI_STATE_SCHED */
-	if (napi_schedule_prep(&vi->napi)) {
-		virtqueue_disable_cb(vi->rvq);
-		__napi_schedule(&vi->napi);
+	if (napi_schedule_prep(&rq->napi)) {
+		virtqueue_disable_cb(rq->vq);
+		__napi_schedule(&rq->napi);
 	}
 }
 
+static void virtnet_napi_enable_all_queues(struct virtnet_info *vi)
+{
+	int i;
+
+	for (i = 0; i < vi->num_queue_pairs; i++)
+		virtnet_napi_enable(vi->rq[i]);
+}
+
 static void refill_work(struct work_struct *work)
 {
-	struct virtnet_info *vi;
+	struct napi_struct *napi;
+	struct receive_queue *rq;
 	bool still_empty;
 
-	vi = container_of(work, struct virtnet_info, refill.work);
-	napi_disable(&vi->napi);
-	still_empty = !try_fill_recv(vi, GFP_KERNEL);
-	virtnet_napi_enable(vi);
+	rq = container_of(work, struct receive_queue, refill.work);
+	napi = &rq->napi;
+
+	napi_disable(napi);
+	still_empty = !try_fill_recv(rq, GFP_KERNEL);
+	virtnet_napi_enable(rq);
 
 	/* In theory, this can happen: if we don't get any buffers in
 	 * we will *never* try to fill again. */
 	if (still_empty)
-		schedule_delayed_work(&vi->refill, HZ/2);
+		schedule_delayed_work(&rq->refill, HZ/2);
 }
 
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
-	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
+	struct receive_queue *rq = container_of(napi, struct receive_queue,
+						napi);
 	void *buf;
 	unsigned int len, received = 0;
 
 again:
 	while (received < budget &&
-	       (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) {
-		receive_buf(vi->dev, buf, len);
-		--vi->num;
+	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
+		receive_buf(rq, buf, len);
+		--rq->num;
 		received++;
 	}
 
-	if (vi->num < vi->max / 2) {
-		if (!try_fill_recv(vi, GFP_ATOMIC))
-			schedule_delayed_work(&vi->refill, 0);
+	if (rq->num < rq->max / 2) {
+		if (!try_fill_recv(rq, GFP_ATOMIC))
+			schedule_delayed_work(&rq->refill, 0);
 	}
 
 	/* Out of packets? */
 	if (received < budget) {
 		napi_complete(napi);
-		if (unlikely(!virtqueue_enable_cb(vi->rvq)) &&
+		if (unlikely(!virtqueue_enable_cb(rq->vq)) &&
 		    napi_schedule_prep(napi)) {
-			virtqueue_disable_cb(vi->rvq);
+			virtqueue_disable_cb(rq->vq);
 			__napi_schedule(napi);
 			goto again;
 		}
@@ -537,13 +585,14 @@ again:
 	return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static unsigned int free_old_xmit_skbs(struct send_queue *sq,
+				       struct virtqueue *vq)
 {
 	struct sk_buff *skb;
 	unsigned int len, tot_sgs = 0;
-	struct virtnet_stats __percpu *stats = this_cpu_ptr(vi->stats);
+	struct virtnet_send_stats __percpu *stats = this_cpu_ptr(sq->stats);
 
-	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
+	while ((skb = virtqueue_get_buf(vq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);
 
 		u64_stats_update_begin(&stats->syncp);
@@ -557,7 +606,8 @@ static unsigned int free_old_xmit_skbs(s
 	return tot_sgs;
 }
 
-static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
+static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb,
+		    struct virtqueue *vq, struct scatterlist *sg)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
@@ -595,44 +645,47 @@ static int xmit_skb(struct virtnet_info 
 
 	/* Encode metadata header at front. */
 	if (vi->mergeable_rx_bufs)
-		sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
+		sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr);
 	else
-		sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
+		sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
 
-	hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
-	return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
+	hdr->num_sg = skb_to_sgvec(skb, sg + 1, 0, skb->len) + 1;
+	return virtqueue_add_buf(vq, sg, hdr->num_sg,
 					0, skb);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int qnum = skb_get_queue_mapping(skb);
+	struct virtqueue *vq = vi->sq[qnum]->vq;
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
-	free_old_xmit_skbs(vi);
+	free_old_xmit_skbs(vi->sq[qnum], vq);
 
 	/* Try to transmit */
-	capacity = xmit_skb(vi, skb);
+	capacity = xmit_skb(vi, skb, vq, vi->sq[qnum]->sg);
 
 	/* This can happen with OOM and indirect buffers. */
 	if (unlikely(capacity < 0)) {
 		if (net_ratelimit()) {
 			if (likely(capacity == -ENOMEM)) {
 				dev_warn(&dev->dev,
-					 "TX queue failure: out of memory\n");
+					 "TXQ (%d) failure: out of memory\n",
+					 qnum);
 			} else {
 				dev->stats.tx_fifo_errors++;
 				dev_warn(&dev->dev,
-					 "Unexpected TX queue failure: %d\n",
-					 capacity);
+					 "Unexpected TXQ (%d) failure: %d\n",
+					 qnum, capacity);
 			}
 		}
 		dev->stats.tx_dropped++;
 		kfree_skb(skb);
 		return NETDEV_TX_OK;
 	}
-	virtqueue_kick(vi->svq);
+	virtqueue_kick(vq);
 
 	/* Don't wait up for transmitted skbs to be freed. */
 	skb_orphan(skb);
@@ -641,13 +694,13 @@ static netdev_tx_t start_xmit(struct sk_
 	/* Apparently nice girls don't return TX_BUSY; stop the queue
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
-		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
+		netif_stop_subqueue(dev, qnum);
+		if (unlikely(!virtqueue_enable_cb_delayed(vq))) {
 			/* More just got used, free them then recheck. */
-			capacity += free_old_xmit_skbs(vi);
+			capacity += free_old_xmit_skbs(vi->sq[qnum], vq);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
-				virtqueue_disable_cb(vi->svq);
+				netif_start_subqueue(dev, qnum);
+				virtqueue_disable_cb(vq);
 			}
 		}
 	}
@@ -677,25 +730,35 @@ static struct rtnl_link_stats64 *virtnet
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int cpu;
-	unsigned int start;
 
 	for_each_possible_cpu(cpu) {
-		struct virtnet_stats __percpu *stats
-			= per_cpu_ptr(vi->stats, cpu);
-		u64 tpackets, tbytes, rpackets, rbytes;
-
-		do {
-			start = u64_stats_fetch_begin(&stats->syncp);
-			tpackets = stats->tx_packets;
-			tbytes   = stats->tx_bytes;
-			rpackets = stats->rx_packets;
-			rbytes   = stats->rx_bytes;
-		} while (u64_stats_fetch_retry(&stats->syncp, start));
-
-		tot->rx_packets += rpackets;
-		tot->tx_packets += tpackets;
-		tot->rx_bytes   += rbytes;
-		tot->tx_bytes   += tbytes;
+		int qpair;
+
+		for (qpair = 0; qpair < vi->num_queue_pairs; qpair++) {
+			struct virtnet_send_stats __percpu *tx_stat;
+			struct virtnet_recv_stats __percpu *rx_stat;
+			u64 tpackets, tbytes, rpackets, rbytes;
+			unsigned int start;
+
+			tx_stat = per_cpu_ptr(vi->sq[qpair]->stats, cpu);
+			do {
+				start = u64_stats_fetch_begin(&tx_stat->syncp);
+				tpackets = tx_stat->tx_packets;
+				tbytes   = tx_stat->tx_bytes;
+			} while (u64_stats_fetch_retry(&tx_stat->syncp, start));
+
+			rx_stat = per_cpu_ptr(vi->rq[qpair]->stats, cpu);
+			do {
+				start = u64_stats_fetch_begin(&rx_stat->syncp);
+				rpackets = rx_stat->rx_packets;
+				rbytes   = rx_stat->rx_bytes;
+			} while (u64_stats_fetch_retry(&rx_stat->syncp, start));
+
+			tot->rx_packets += rpackets;
+			tot->tx_packets += tpackets;
+			tot->rx_bytes   += rbytes;
+			tot->tx_bytes   += tbytes;
+		}
 	}
 
 	tot->tx_dropped = dev->stats.tx_dropped;
@@ -710,16 +773,35 @@ static struct rtnl_link_stats64 *virtnet
 static void virtnet_netpoll(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
-	napi_schedule(&vi->napi);
+	for (i = 0; i < vi->num_queue_pairs; i++)
+		napi_schedule(&vi->rq[i]->napi);
 }
 #endif
 
+static void free_stats(struct virtnet_info *vi)
+{
+	int i;
+
+	for (i = 0; i < vi->num_queue_pairs; i++) {
+		if (vi->sq && vi->sq[i]) {
+			free_percpu(vi->sq[i]->stats);
+			vi->sq[i]->stats = NULL;
+		}
+
+		if (vi->rq && vi->rq[i]) {
+			free_percpu(vi->rq[i]->stats);
+			vi->rq[i]->stats = NULL;
+		}
+	}
+}
+
 static int virtnet_open(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	virtnet_napi_enable(vi);
+	virtnet_napi_enable_all_queues(vi);
 	return 0;
 }
 
@@ -771,8 +853,10 @@ static bool virtnet_send_command(struct 
 static int virtnet_close(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
-	napi_disable(&vi->napi);
+	for (i = 0; i < vi->num_queue_pairs; i++)
+		napi_disable(&vi->rq[i]->napi);
 
 	return 0;
 }
@@ -882,11 +966,10 @@ static void virtnet_get_ringparam(struct
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rvq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->svq);
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0]->vq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0]->vq);
 	ring->rx_pending = ring->rx_max_pending;
 	ring->tx_pending = ring->tx_max_pending;
-
 }
 
 static const struct ethtool_ops virtnet_ethtool_ops = {
@@ -940,10 +1023,10 @@ static void virtnet_update_status(struct
 
 	if (vi->status & VIRTIO_NET_S_LINK_UP) {
 		netif_carrier_on(vi->dev);
-		netif_wake_queue(vi->dev);
+		netif_tx_wake_all_queues(vi->dev);
 	} else {
 		netif_carrier_off(vi->dev);
-		netif_stop_queue(vi->dev);
+		netif_tx_stop_all_queues(vi->dev);
 	}
 }
 
@@ -954,18 +1037,232 @@ static void virtnet_config_changed(struc
 	virtnet_update_status(vi);
 }
 
+static void free_receive_bufs(struct virtnet_info *vi)
+{
+	int i;
+
+	for (i = 0; i < vi->num_queue_pairs; i++) {
+		while (vi->rq[i]->pages)
+			__free_pages(get_a_page(vi->rq[i], GFP_KERNEL), 0);
+	}
+}
+
+/* Free send and receive queues; idempotent since vi->rq/vi->sq are reset */
+static void free_rq_sq(struct virtnet_info *vi)
+{
+	int i;
+
+	free_stats(vi);
+	if (vi->rq) {
+		for (i = 0; i < vi->num_queue_pairs; i++)
+			kfree(vi->rq[i]);
+		kfree(vi->rq);
+		vi->rq = NULL;	/* probe's error path may call us again */
+	}
+	if (vi->sq) {
+		for (i = 0; i < vi->num_queue_pairs; i++)
+			kfree(vi->sq[i]);
+		kfree(vi->sq);
+		vi->sq = NULL;	/* probe's error path may call us again */
+	}
+}
+
+static void free_unused_bufs(struct virtnet_info *vi)
+{
+	void *buf;
+	int i;
+
+	for (i = 0; i < vi->num_queue_pairs; i++) {
+		struct virtqueue *vq = vi->sq[i]->vq;
+
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
+			dev_kfree_skb(buf);
+	}
+
+	for (i = 0; i < vi->num_queue_pairs; i++) {
+		struct virtqueue *vq = vi->rq[i]->vq;
+
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
+			if (vi->mergeable_rx_bufs || vi->big_packets)
+				give_pages(vi->rq[i], buf);
+			else
+				dev_kfree_skb(buf);
+			--vi->rq[i]->num;
+		}
+		BUG_ON(vi->rq[i]->num != 0);
+	}
+}
+
+static void setup_rx_vqs(struct virtnet_info *vi, struct virtqueue **vqs,
+			 int total_vqs)
+{
+	int i;
+
+	for (i = 0; i < total_vqs; i += 2)
+		vi->rq[i/2]->vq = vqs[i];
+}
+
+static void setup_tx_vqs(struct virtnet_info *vi, struct virtqueue **vqs,
+			 int total_vqs)
+{
+	int i;
+
+	for (i = 0; i < total_vqs; i += 2)
+		vi->sq[i/2]->vq = vqs[i + 1];
+}
+
+static void setup_cvq(struct virtnet_info *vi, struct virtqueue **vqs,
+		      int index)
+{
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ))
+		vi->cvq = vqs[index];
+}
+
+/* Build per-vq callbacks/names, call find_vqs and store the vqs in vi */
+static int invoke_find_vqs(struct virtnet_info *vi)
+{
+	vq_callback_t **callbacks;
+	struct virtqueue **vqs;
+	int ret = -ENOMEM;
+	int i, total_vqs;
+	char **names;
+
+	/*
+	 * We expect 1 RX virtqueue followed by 1 TX virtqueue, followed
+	 * by the same 'vi->num_queue_pairs-1' more times, and optionally
+	 * one control virtqueue.
+	 */
+	total_vqs = vi->num_queue_pairs * 2 +
+		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
+
+	/* names[] must start zeroed: the err path kfree()s unset slots too */
+	vqs = kmalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
+	callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
+	names = kcalloc(total_vqs, sizeof(*names), GFP_KERNEL);
+	if (!vqs || !callbacks || !names)
+		goto err;
+
+	/* Allocate/initialize parameters for recv virtqueues */
+	for (i = 0; i < vi->num_queue_pairs * 2; i += 2) {
+		callbacks[i] = skb_recv_done;
+		names[i] = kasprintf(GFP_KERNEL, "input.%d", i / 2);
+		if (!names[i])
+			goto err;
+	}
+
+	/* Allocate/initialize parameters for send virtqueues */
+	for (i = 1; i < vi->num_queue_pairs * 2; i += 2) {
+		callbacks[i] = skb_xmit_done;
+		names[i] = kasprintf(GFP_KERNEL, "output.%d", i / 2);
+		if (!names[i])
+			goto err;
+	}
+
+	/* Control vq, if any: i is 2 * num_queue_pairs + 1 after the loop */
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
+		callbacks[i - 1] = NULL;
+		names[i - 1] = "control";
+	}
+
+	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
+					 (const char **)names);
+	if (ret)
+		goto err;
+
+	setup_rx_vqs(vi, vqs, vi->num_queue_pairs * 2);
+	setup_tx_vqs(vi, vqs, vi->num_queue_pairs * 2);
+	setup_cvq(vi, vqs, vi->num_queue_pairs * 2);
+
+err:
+	if (ret && names)
+		for (i = 0; i < vi->num_queue_pairs * 2; i++)
+			kfree(names[i]);
+
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
+
+	return ret;
+}
+
+static int allocate_queues(struct virtnet_info *vi)
+{
+	int ret = -ENOMEM;
+	int i;
+
+	vi->rq = kcalloc(vi->num_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
+	vi->sq = kcalloc(vi->num_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
+	if (!vi->sq || !vi->rq)
+		goto err;
+
+	for (i = 0; i < vi->num_queue_pairs; i++) {
+		vi->rq[i] = kzalloc(sizeof(*vi->rq[i]), GFP_KERNEL);
+		vi->sq[i] = kzalloc(sizeof(*vi->sq[i]), GFP_KERNEL);
+		if (!vi->rq[i] || !vi->sq[i])
+			goto err;
+
+		vi->rq[i]->stats = alloc_percpu(struct virtnet_recv_stats);
+		vi->sq[i]->stats = alloc_percpu(struct virtnet_send_stats);
+		if (!vi->rq[i]->stats || !vi->sq[i]->stats)
+			goto err;
+	}
+
+	ret = 0;
+
+	/* setup initial receive and send queue parameters */
+	for (i = 0; i < vi->num_queue_pairs; i++) {
+		vi->rq[i]->vi = vi;
+		vi->rq[i]->pages = NULL;
+		INIT_DELAYED_WORK(&vi->rq[i]->refill, refill_work);
+		netif_napi_add(vi->dev, &vi->rq[i]->napi, virtnet_poll,
+			       napi_weight);
+
+		sg_init_table(vi->rq[i]->sg, ARRAY_SIZE(vi->rq[i]->sg));
+		sg_init_table(vi->sq[i]->sg, ARRAY_SIZE(vi->sq[i]->sg));
+	}
+
+err:
+	if (ret)
+		free_rq_sq(vi);
+
+	return ret;
+}
+
+static int virtnet_setup_vqs(struct virtnet_info *vi)
+{
+	int ret;
+
+	/* Allocate send & receive queues */
+	ret = allocate_queues(vi);
+	if (!ret) {
+		ret = invoke_find_vqs(vi);
+		if (ret)
+			free_rq_sq(vi);
+	}
+
+	return ret;
+}
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
-	int err;
+	int i, err;
 	struct net_device *dev;
 	struct virtnet_info *vi;
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
-	const char *names[] = { "input", "output", "control" };
-	int nvqs;
+	u16 num_queues, num_queue_pairs;
+
+	/* Find if host supports multiqueue virtio_net device */
+	err = virtio_config_val(vdev, VIRTIO_NET_F_MULTIQUEUE,
+				offsetof(struct virtio_net_config,
+				num_queues), &num_queues);
+
+	/* We need at least 2 queues (one RX/TX pair) */
+	if (err || num_queues < 2)
+		num_queues = 2;
+
+	num_queue_pairs = num_queues / 2;
 
 	/* Allocate ourselves a network device with room for our info */
-	dev = alloc_etherdev(sizeof(struct virtnet_info));
+	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), num_queue_pairs);
 	if (!dev)
 		return -ENOMEM;
 
@@ -1011,19 +1308,10 @@ static int virtnet_probe(struct virtio_d
 
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
-	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->pages = NULL;
-	vi->stats = alloc_percpu(struct virtnet_stats);
-	err = -ENOMEM;
-	if (vi->stats == NULL)
-		goto free;
-
-	INIT_DELAYED_WORK(&vi->refill, refill_work);
-	sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg));
-	sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg));
+	vi->num_queue_pairs = num_queue_pairs;
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1034,23 +1322,14 @@ static int virtnet_probe(struct virtio_d
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
-	/* We expect two virtqueues, receive then send,
-	 * and optionally control. */
-	nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
-
-	err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names);
+	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
+	err = virtnet_setup_vqs(vi);
 	if (err)
-		goto free_stats;
-
-	vi->rvq = vqs[0];
-	vi->svq = vqs[1];
+		goto free_netdev;
 
-	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
-		vi->cvq = vqs[2];
-
-		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
-			dev->features |= NETIF_F_HW_VLAN_FILTER;
-	}
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) &&
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
+		dev->features |= NETIF_F_HW_VLAN_FILTER;
 
 	err = register_netdev(dev);
 	if (err) {
@@ -1059,12 +1338,15 @@ static int virtnet_probe(struct virtio_d
 	}
 
 	/* Last of all, set up some receive buffers. */
-	try_fill_recv(vi, GFP_KERNEL);
+	for (i = 0; i < num_queue_pairs; i++) {
+		try_fill_recv(vi->rq[i], GFP_KERNEL);
 
-	/* If we didn't even get one input buffer, we're useless. */
-	if (vi->num == 0) {
-		err = -ENOMEM;
-		goto unregister;
+		/* If we didn't even get one input buffer, we're useless. */
+		if (vi->rq[i]->num == 0) {
+			free_unused_bufs(vi);
+			err = -ENOMEM;
+			goto free_recv_bufs;
+		}
 	}
 
 	/* Assume link up if device can't report link status,
@@ -1077,63 +1359,51 @@ static int virtnet_probe(struct virtio_d
 		netif_carrier_on(dev);
 	}
 
-	pr_debug("virtnet: registered device %s\n", dev->name);
+	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
+		 dev->name, num_queue_pairs);
+
 	return 0;
 
-unregister:
+free_recv_bufs:
+	free_receive_bufs(vi);
 	unregister_netdev(dev);
-	cancel_delayed_work_sync(&vi->refill);
+
 free_vqs:
+	for (i = 0; i < num_queue_pairs; i++)
+		cancel_delayed_work_sync(&vi->rq[i]->refill);
 	vdev->config->del_vqs(vdev);
-free_stats:
-	free_percpu(vi->stats);
-free:
+
+free_netdev:
+	free_rq_sq(vi);
+
 	free_netdev(dev);
 	return err;
 }
 
-static void free_unused_bufs(struct virtnet_info *vi)
-{
-	void *buf;
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->svq);
-		if (!buf)
-			break;
-		dev_kfree_skb(buf);
-	}
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->rvq);
-		if (!buf)
-			break;
-		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
-		else
-			dev_kfree_skb(buf);
-		--vi->num;
-	}
-	BUG_ON(vi->num != 0);
-}
-
 static void __devexit virtnet_remove(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
+	int i;
 
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
 
 	unregister_netdev(vi->dev);
-	cancel_delayed_work_sync(&vi->refill);
+
+	for (i = 0; i < vi->num_queue_pairs; i++)
+		cancel_delayed_work_sync(&vi->rq[i]->refill);
 
 	/* Free unused buffers in both send and recv, if any. */
 	free_unused_bufs(vi);
 
 	vdev->config->del_vqs(vi->vdev);
 
-	while (vi->pages)
-		__free_pages(get_a_page(vi, GFP_KERNEL), 0);
+	free_receive_bufs(vi);
+
+	/* Free memory for send and receive queues */
+	free_rq_sq(vi);
 
-	free_percpu(vi->stats);
 	free_netdev(vi->dev);
 }
 
@@ -1149,7 +1419,7 @@ static unsigned int features[] = {
 	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
 	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
 	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
-	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
+	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_MULTIQUEUE,
 };
 
 static struct virtio_driver virtio_net_driver = {
diff -ruNp org/include/linux/virtio_net.h new/include/linux/virtio_net.h
--- org/include/linux/virtio_net.h	2011-11-11 16:44:38.000000000 +0530
+++ new/include/linux/virtio_net.h	2011-11-11 16:44:59.000000000 +0530
@@ -58,6 +58,8 @@ struct virtio_net_config {
 	__u8 mac[6];
 	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
 	__u16 status;
+	/* Total number of RX/TX queues */
+	__u16 num_queues;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't

^ permalink raw reply

* [RFC] [ver3 PATCH 4/6] vhost_net: vhost_net changes
From: Krishna Kumar @ 2011-11-11 13:05 UTC (permalink / raw)
  To: rusty, mst; +Cc: netdev, Krishna Kumar, davem, kvm, virtualization
In-Reply-To: <20111111130223.9878.59517.sendpatchset@krkumar2.in.ibm.com>

Changes for multiqueue vhost_net driver.

Signed-off-by: krkumar2@in.ibm.com
---
 drivers/vhost/net.c   |  253 +++++++++++++++++++++++++---------------
 drivers/vhost/vhost.c |  225 ++++++++++++++++++++++++-----------
 drivers/vhost/vhost.h |   26 +++-
 3 files changed, 340 insertions(+), 164 deletions(-)

diff -ruNp org/drivers/vhost/net.c new/drivers/vhost/net.c
--- org/drivers/vhost/net.c	2011-11-11 16:44:56.000000000 +0530
+++ new/drivers/vhost/net.c	2011-11-11 16:45:11.000000000 +0530
@@ -41,12 +41,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "
 #define VHOST_MAX_PEND 128
 #define VHOST_GOODCOPY_LEN 256
 
-enum {
-	VHOST_NET_VQ_RX = 0,
-	VHOST_NET_VQ_TX = 1,
-	VHOST_NET_VQ_MAX = 2,
-};
-
 enum vhost_net_poll_state {
 	VHOST_NET_POLL_DISABLED = 0,
 	VHOST_NET_POLL_STARTED = 1,
@@ -55,12 +49,13 @@ enum vhost_net_poll_state {
 
 struct vhost_net {
 	struct vhost_dev dev;
-	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
-	struct vhost_poll poll[VHOST_NET_VQ_MAX];
+	struct vhost_virtqueue *vqs;
+	struct vhost_poll *poll;
+	struct socket **socks;
 	/* Tells us whether we are polling a socket for TX.
 	 * We only do this when socket buffer fills up.
 	 * Protected by tx vq lock. */
-	enum vhost_net_poll_state tx_poll_state;
+	enum vhost_net_poll_state *tx_poll_state;
 };
 
 static bool vhost_sock_zcopy(struct socket *sock)
@@ -108,28 +103,28 @@ static void copy_iovec_hdr(const struct 
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_stop(struct vhost_net *net)
+static void tx_poll_stop(struct vhost_net *net, int qnum)
 {
-	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
+	if (likely(net->tx_poll_state[qnum / 2] != VHOST_NET_POLL_STARTED))
 		return;
-	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
-	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
+	vhost_poll_stop(&net->poll[qnum]);
+	net->tx_poll_state[qnum / 2] = VHOST_NET_POLL_STOPPED;
 }
 
 /* Caller must have TX VQ lock */
-static void tx_poll_start(struct vhost_net *net, struct socket *sock)
+static void tx_poll_start(struct vhost_net *net, struct socket *sock, int qnum)
 {
-	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
+	if (unlikely(net->tx_poll_state[qnum / 2] != VHOST_NET_POLL_STOPPED))
 		return;
-	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
-	net->tx_poll_state = VHOST_NET_POLL_STARTED;
+	vhost_poll_start(&net->poll[qnum], sock->file);
+	net->tx_poll_state[qnum / 2] = VHOST_NET_POLL_STARTED;
 }
 
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_tx(struct vhost_net *net)
+static void handle_tx(struct vhost_virtqueue *vq)
 {
-	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
 	unsigned out, in, s;
 	int head;
 	struct msghdr msg = {
@@ -155,7 +150,7 @@ static void handle_tx(struct vhost_net *
 	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
 	if (wmem >= sock->sk->sk_sndbuf) {
 		mutex_lock(&vq->mutex);
-		tx_poll_start(net, sock);
+		tx_poll_start(net, sock, vq->qnum);
 		mutex_unlock(&vq->mutex);
 		return;
 	}
@@ -164,7 +159,7 @@ static void handle_tx(struct vhost_net *
 	vhost_disable_notify(&net->dev, vq);
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
-		tx_poll_stop(net);
+		tx_poll_stop(net, vq->qnum);
 	hdr_size = vq->vhost_hlen;
 	zcopy = vhost_sock_zcopy(sock);
 
@@ -186,7 +181,7 @@ static void handle_tx(struct vhost_net *
 
 			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
 			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
-				tx_poll_start(net, sock);
+				tx_poll_start(net, sock, vq->qnum);
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
@@ -197,7 +192,7 @@ static void handle_tx(struct vhost_net *
 				    (vq->upend_idx - vq->done_idx) :
 				    (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
 			if (unlikely(num_pends > VHOST_MAX_PEND)) {
-				tx_poll_start(net, sock);
+				tx_poll_start(net, sock, vq->qnum);
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
@@ -257,7 +252,7 @@ static void handle_tx(struct vhost_net *
 					UIO_MAXIOV;
 			}
 			vhost_discard_vq_desc(vq, 1);
-			tx_poll_start(net, sock);
+			tx_poll_start(net, sock, vq->qnum);
 			break;
 		}
 		if (err != len)
@@ -353,9 +348,9 @@ err:
 
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_rx(struct vhost_net *net)
+static void handle_rx(struct vhost_virtqueue *vq)
 {
-	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
 	unsigned uninitialized_var(in), log;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
@@ -464,87 +459,155 @@ static void handle_tx_kick(struct vhost_
 {
 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
 						  poll.work);
-	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
 
-	handle_tx(net);
+	handle_tx(vq);
 }
 
 static void handle_rx_kick(struct vhost_work *work)
 {
 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
 						  poll.work);
-	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
 
-	handle_rx(net);
+	handle_rx(vq);
 }
 
 static void handle_tx_net(struct vhost_work *work)
 {
-	struct vhost_net *net = container_of(work, struct vhost_net,
-					     poll[VHOST_NET_VQ_TX].work);
-	handle_tx(net);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_poll,
+						  work)->vq;
+
+	handle_tx(vq);
 }
 
 static void handle_rx_net(struct vhost_work *work)
 {
-	struct vhost_net *net = container_of(work, struct vhost_net,
-					     poll[VHOST_NET_VQ_RX].work);
-	handle_rx(net);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_poll,
+						  work)->vq;
+
+	handle_rx(vq);
 }
 
-static int vhost_net_open(struct inode *inode, struct file *f)
+void vhost_free_vqs(struct vhost_dev *dev)
 {
-	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
-	struct vhost_dev *dev;
-	int r;
+	struct vhost_net *n = container_of(dev, struct vhost_net, dev);
 
-	if (!n)
-		return -ENOMEM;
+	if (!n->vqs)
+		return;
 
-	dev = &n->dev;
-	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
-	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
-	if (r < 0) {
-		kfree(n);
-		return r;
+	kfree(n->socks);
+	kfree(n->tx_poll_state);
+	kfree(n->poll);
+	kfree(n->vqs);
+
+	/*
+	 * Reset so that vhost_net_release (which gets called when
+	 * vhost_dev_set_owner() call fails) will notice.
+	 */
+	n->vqs = NULL;
+}
+
+int vhost_setup_vqs(struct vhost_dev *dev, int numtxqs)
+{
+	struct vhost_net *n = container_of(dev, struct vhost_net, dev);
+	int i, nvqs;
+	int ret = -ENOMEM;
+
+	if (numtxqs < 0)
+		return -EINVAL;
+
+	if (numtxqs == 0) {
+		/* Old qemu doesn't pass arguments to set_owner, use 1 txq */
+		numtxqs = 1;
+	}
+
+	/* Get total number of virtqueues */
+	nvqs = numtxqs * 2;
+
+	n->vqs = kmalloc(nvqs * sizeof(*n->vqs), GFP_KERNEL);
+	n->poll = kmalloc(nvqs * sizeof(*n->poll), GFP_KERNEL);
+	n->socks = kmalloc(nvqs * sizeof(*n->socks), GFP_KERNEL);
+	n->tx_poll_state = kmalloc(numtxqs * sizeof(*n->tx_poll_state),
+				   GFP_KERNEL);
+	if (!n->vqs || !n->poll || !n->socks || !n->tx_poll_state)
+		goto err;
+
+	/* RX followed by TX queues */
+	for (i = 0; i < nvqs; i += 2) {
+		n->vqs[i].handle_kick = handle_rx_kick;
+		n->vqs[i + 1].handle_kick = handle_tx_kick;
 	}
 
-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
-	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+	ret = vhost_dev_init(dev, n->vqs, nvqs);
+	if (ret < 0)
+		goto err;
 
-	f->private_data = n;
+	for (i = 0; i < nvqs; i += 2) {
+		vhost_poll_init(&n->poll[i], handle_rx_net, POLLIN,
+				&n->vqs[i]);
+		vhost_poll_init(&n->poll[i+1], handle_tx_net, POLLOUT,
+				&n->vqs[i+1]);
+		if (i / 2 < numtxqs)
+			n->tx_poll_state[i/2] = VHOST_NET_POLL_DISABLED;
+	}
 
 	return 0;
+
+err:
+	/* Free all pointers that may have been allocated */
+	vhost_free_vqs(dev);
+
+	return ret;
+}
+
+static int vhost_net_open(struct inode *inode, struct file *f)
+{
+	struct vhost_net *n = kzalloc(sizeof *n, GFP_KERNEL);
+	int ret = -ENOMEM;
+
+	if (n) {
+		struct vhost_dev *dev = &n->dev;
+
+		f->private_data = n;
+		mutex_init(&dev->mutex);
+
+		/* Defer all other initialization till user does SET_OWNER */
+		ret = 0;
+	}
+
+	return ret;
 }
 
 static void vhost_net_disable_vq(struct vhost_net *n,
 				 struct vhost_virtqueue *vq)
 {
+	int qnum = vq->qnum;
+
 	if (!vq->private_data)
 		return;
-	if (vq == n->vqs + VHOST_NET_VQ_TX) {
-		tx_poll_stop(n);
-		n->tx_poll_state = VHOST_NET_POLL_DISABLED;
-	} else
-		vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
+	if (qnum & 1) {		/* Odd qnum -> TX */
+		tx_poll_stop(n, qnum);
+		n->tx_poll_state[qnum / 2] = VHOST_NET_POLL_DISABLED;
+	} else {		/* Even qnum -> RX */
+		vhost_poll_stop(&n->poll[qnum]);
+	}
 }
 
 static void vhost_net_enable_vq(struct vhost_net *n,
 				struct vhost_virtqueue *vq)
 {
 	struct socket *sock;
+	int qnum = vq->qnum;
 
 	sock = rcu_dereference_protected(vq->private_data,
 					 lockdep_is_held(&vq->mutex));
 	if (!sock)
 		return;
-	if (vq == n->vqs + VHOST_NET_VQ_TX) {
-		n->tx_poll_state = VHOST_NET_POLL_STOPPED;
-		tx_poll_start(n, sock);
-	} else
-		vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
+	if (qnum & 1) {		/* Odd qnum -> TX */
+		n->tx_poll_state[qnum / 2] = VHOST_NET_POLL_STOPPED;
+		tx_poll_start(n, sock, qnum);
+	} else {		/* Even qnum -> RX */
+		vhost_poll_start(&n->poll[qnum], sock->file);
+	}
 }
 
 static struct socket *vhost_net_stop_vq(struct vhost_net *n,
@@ -561,11 +624,12 @@ static struct socket *vhost_net_stop_vq(
 	return sock;
 }
 
-static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
-			   struct socket **rx_sock)
+static void vhost_net_stop(struct vhost_net *n)
 {
-	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
-	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
+	int i;
+
+	for (i = 0; i < n->dev.nvqs; i++)
+		n->socks[i] = vhost_net_stop_vq(n, &n->vqs[i]);
 }
 
 static void vhost_net_flush_vq(struct vhost_net *n, int index)
@@ -576,26 +640,33 @@ static void vhost_net_flush_vq(struct vh
 
 static void vhost_net_flush(struct vhost_net *n)
 {
-	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
-	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
+	int i;
+
+	for (i = 0; i < n->dev.nvqs; i++)
+		vhost_net_flush_vq(n, i);
 }
 
 static int vhost_net_release(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = f->private_data;
-	struct socket *tx_sock;
-	struct socket *rx_sock;
+	struct vhost_dev *dev = &n->dev;
+	int i;
 
-	vhost_net_stop(n, &tx_sock, &rx_sock);
+	vhost_net_stop(n);
 	vhost_net_flush(n);
-	vhost_dev_cleanup(&n->dev);
-	if (tx_sock)
-		fput(tx_sock->file);
-	if (rx_sock)
-		fput(rx_sock->file);
+	vhost_dev_cleanup(dev);
+
+	for (i = 0; i < n->dev.nvqs; i++)
+		if (n->socks[i])
+			fput(n->socks[i]->file);
+
 	/* We do an extra flush before freeing memory,
 	 * since jobs can re-queue themselves. */
 	vhost_net_flush(n);
+
+	/* Free all old pointers */
+	vhost_free_vqs(dev);
+
 	kfree(n);
 	return 0;
 }
@@ -677,7 +748,7 @@ static long vhost_net_set_backend(struct
 	if (r)
 		goto err;
 
-	if (index >= VHOST_NET_VQ_MAX) {
+	if (index >= n->dev.nvqs) {
 		r = -ENOBUFS;
 		goto err;
 	}
@@ -743,23 +814,25 @@ err:
 
 static long vhost_net_reset_owner(struct vhost_net *n)
 {
-	struct socket *tx_sock = NULL;
-	struct socket *rx_sock = NULL;
 	long err;
+	int i;
 
 	mutex_lock(&n->dev.mutex);
 	err = vhost_dev_check_owner(&n->dev);
-	if (err)
-		goto done;
-	vhost_net_stop(n, &tx_sock, &rx_sock);
+	if (err) {
+		mutex_unlock(&n->dev.mutex);
+		return err;
+	}
+
+	vhost_net_stop(n);
 	vhost_net_flush(n);
 	err = vhost_dev_reset_owner(&n->dev);
-done:
 	mutex_unlock(&n->dev.mutex);
-	if (tx_sock)
-		fput(tx_sock->file);
-	if (rx_sock)
-		fput(rx_sock->file);
+
+	for (i = 0; i < n->dev.nvqs; i++)
+		if (n->socks[i])
+			fput(n->socks[i]->file);
+
 	return err;
 }
 
@@ -788,7 +861,7 @@ static int vhost_net_set_features(struct
 	}
 	n->dev.acked_features = features;
 	smp_wmb();
-	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+	for (i = 0; i < n->dev.nvqs; ++i) {
 		mutex_lock(&n->vqs[i].mutex);
 		n->vqs[i].vhost_hlen = vhost_hlen;
 		n->vqs[i].sock_hlen = sock_hlen;
@@ -864,7 +937,7 @@ static struct miscdevice vhost_net_misc 
 static int vhost_net_init(void)
 {
 	if (experimental_zcopytx)
-		vhost_enable_zcopy(VHOST_NET_VQ_TX);
+		vhost_enable_zcopy(VHOST_NET_TX_VQS);
 	return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);
diff -ruNp org/drivers/vhost/vhost.c new/drivers/vhost/vhost.c
--- org/drivers/vhost/vhost.c	2011-11-11 16:44:56.000000000 +0530
+++ new/drivers/vhost/vhost.c	2011-11-11 16:45:11.000000000 +0530
@@ -75,12 +75,12 @@ static void vhost_work_init(struct vhost
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev)
+		     unsigned long mask, struct vhost_virtqueue *vq)
 {
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
-	poll->dev = dev;
+	poll->vq = vq;
 
 	vhost_work_init(&poll->work, fn);
 }
@@ -103,30 +103,31 @@ void vhost_poll_stop(struct vhost_poll *
 	remove_wait_queue(poll->wqh, &poll->wait);
 }
 
-static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
-				unsigned seq)
+static bool vhost_work_seq_done(struct vhost_virtqueue *vq,
+				struct vhost_work *work, unsigned seq)
 {
 	int left;
 
-	spin_lock_irq(&dev->work_lock);
+	spin_lock_irq(vq->work_lock);
 	left = seq - work->done_seq;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(vq->work_lock);
 	return left <= 0;
 }
 
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+static void vhost_work_flush(struct vhost_virtqueue *vq,
+			     struct vhost_work *work)
 {
 	unsigned seq;
 	int flushing;
 
-	spin_lock_irq(&dev->work_lock);
+	spin_lock_irq(vq->work_lock);
 	seq = work->queue_seq;
 	work->flushing++;
-	spin_unlock_irq(&dev->work_lock);
-	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
-	spin_lock_irq(&dev->work_lock);
+	spin_unlock_irq(vq->work_lock);
+	wait_event(work->done, vhost_work_seq_done(vq, work, seq));
+	spin_lock_irq(vq->work_lock);
 	flushing = --work->flushing;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(vq->work_lock);
 	BUG_ON(flushing < 0);
 }
 
@@ -134,26 +135,26 @@ static void vhost_work_flush(struct vhos
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	vhost_work_flush(poll->dev, &poll->work);
+	vhost_work_flush(poll->vq, &poll->work);
 }
 
-static inline void vhost_work_queue(struct vhost_dev *dev,
+static inline void vhost_work_queue(struct vhost_virtqueue *vq,
 				    struct vhost_work *work)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&dev->work_lock, flags);
+	spin_lock_irqsave(vq->work_lock, flags);
 	if (list_empty(&work->node)) {
-		list_add_tail(&work->node, &dev->work_list);
+		list_add_tail(&work->node, vq->work_list);
 		work->queue_seq++;
-		wake_up_process(dev->worker);
+		wake_up_process(vq->worker);
 	}
-	spin_unlock_irqrestore(&dev->work_lock, flags);
+	spin_unlock_irqrestore(vq->work_lock, flags);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	vhost_work_queue(poll->dev, &poll->work);
+	vhost_work_queue(poll->vq, &poll->work);
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -188,17 +189,17 @@ static void vhost_vq_reset(struct vhost_
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
+	struct vhost_virtqueue *vq = data;
 	struct vhost_work *work = NULL;
 	unsigned uninitialized_var(seq);
 
-	use_mm(dev->mm);
+	use_mm(vq->dev->mm);
 
 	for (;;) {
 		/* mb paired w/ kthread_stop */
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		spin_lock_irq(&dev->work_lock);
+		spin_lock_irq(vq->work_lock);
 		if (work) {
 			work->done_seq = seq;
 			if (work->flushing)
@@ -206,18 +207,18 @@ static int vhost_worker(void *data)
 		}
 
 		if (kthread_should_stop()) {
-			spin_unlock_irq(&dev->work_lock);
+			spin_unlock_irq(vq->work_lock);
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
-		if (!list_empty(&dev->work_list)) {
-			work = list_first_entry(&dev->work_list,
+		if (!list_empty(vq->work_list)) {
+			work = list_first_entry(vq->work_list,
 						struct vhost_work, node);
 			list_del_init(&work->node);
 			seq = work->queue_seq;
 		} else
 			work = NULL;
-		spin_unlock_irq(&dev->work_lock);
+		spin_unlock_irq(vq->work_lock);
 
 		if (work) {
 			__set_current_state(TASK_RUNNING);
@@ -226,7 +227,7 @@ static int vhost_worker(void *data)
 			schedule();
 
 	}
-	unuse_mm(dev->mm);
+	unuse_mm(vq->dev->mm);
 	return 0;
 }
 
@@ -260,7 +261,7 @@ static long vhost_dev_alloc_iovecs(struc
 					  GFP_KERNEL);
 		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
 					    UIO_MAXIOV, GFP_KERNEL);
-		zcopy = vhost_zcopy_mask & (0x1 << i);
+		zcopy = vhost_zcopy_mask & (0x1 << (i & VHOST_NET_TX_VQS));
 		if (zcopy)
 			dev->vqs[i].ubuf_info =
 				kmalloc(sizeof *dev->vqs[i].ubuf_info *
@@ -286,6 +287,30 @@ static void vhost_dev_free_iovecs(struct
 		vhost_vq_free_iovecs(&dev->vqs[i]);
 }
 
+/*
+ * Get index of an existing thread that will handle this rx/tx queue pair.
+ * The same thread handles both rx and tx.
+ */
+static int vhost_get_thread_index(int index)
+{
+	return (index / 2) % MAX_VHOST_THREADS;
+}
+
+/* Get index of an earlier vq whose worker thread we can share */
+static int vhost_get_vq_index(int index)
+{
+	return vhost_get_thread_index(index) * 2;
+}
+
+/*
+ * This is needed to determine whether work_list/work_lock needs
+ * initialization; or to start a new worker thread.
+ */
+static int vhost_needs_init(int i, int j)
+{
+	return i == j * 2;
+}
+
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
@@ -298,21 +323,31 @@ long vhost_dev_init(struct vhost_dev *de
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
-	spin_lock_init(&dev->work_lock);
-	INIT_LIST_HEAD(&dev->work_list);
-	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i].log = NULL;
-		dev->vqs[i].indirect = NULL;
-		dev->vqs[i].heads = NULL;
-		dev->vqs[i].ubuf_info = NULL;
-		dev->vqs[i].dev = dev;
-		mutex_init(&dev->vqs[i].mutex);
+		struct vhost_virtqueue *vq = &dev->vqs[i];
+		int j = vhost_get_thread_index(i);
+
+		if (vhost_needs_init(i, j)) {
+			spin_lock_init(&dev->work[j].work_lock);
+			INIT_LIST_HEAD(&dev->work[j].work_list);
+		}
+
+		vq->ubuf_info = NULL;
+		vq->work_lock = &dev->work[j].work_lock;
+		vq->work_list = &dev->work[j].work_list;
+
+		vq->worker = NULL;
+		vq->qnum = i;
+		vq->log = NULL;
+		vq->indirect = NULL;
+		vq->heads = NULL;
+		vq->dev = dev;
+		mutex_init(&vq->mutex);
 		vhost_vq_reset(dev, dev->vqs + i);
-		if (dev->vqs[i].handle_kick)
-			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick, POLLIN, dev);
+		if (vq->handle_kick)
+			vhost_poll_init(&vq->poll,
+					vq->handle_kick, POLLIN, vq);
 	}
 
 	return 0;
@@ -339,21 +374,83 @@ static void vhost_attach_cgroups_work(st
 	s->ret = cgroup_attach_task_all(s->owner, current);
 }
 
-static int vhost_attach_cgroups(struct vhost_dev *dev)
+static int vhost_attach_cgroups(struct vhost_virtqueue *vq)
 {
 	struct vhost_attach_cgroups_struct attach;
 
 	attach.owner = current;
 	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_work_flush(dev, &attach.work);
+	vhost_work_queue(vq, &attach.work);
+	vhost_work_flush(vq, &attach.work);
 	return attach.ret;
 }
 
+static void __vhost_stop_workers(struct vhost_dev *dev, int nvhosts)
+{
+	int i;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		if (i < nvhosts) {
+			WARN_ON(!list_empty(dev->vqs[i * 2].work_list));
+			if (dev->vqs[i * 2].worker)
+				kthread_stop(dev->vqs[i * 2].worker);
+		}
+		dev->vqs[i].worker = NULL;
+	}
+
+	if (dev->mm)
+		mmput(dev->mm);
+	dev->mm = NULL;
+}
+
+static void vhost_stop_workers(struct vhost_dev *dev)
+{
+	int nthreads = min_t(int, dev->nvqs / 2, MAX_VHOST_THREADS);
+
+	__vhost_stop_workers(dev, nthreads);
+}
+
+static int vhost_start_workers(struct vhost_dev *dev)
+{
+	int i, err;
+
+	for (i = 0; i < dev->nvqs; ++i) {
+		struct vhost_virtqueue *vq = &dev->vqs[i];
+		int j = vhost_get_thread_index(i);
+
+		if (vhost_needs_init(i, j)) {
+			/* Start a new thread */
+			vq->worker = kthread_create(vhost_worker, vq,
+						    "vhost-%d-%d",
+						    current->pid, j);
+			if (IS_ERR(vq->worker)) {
+				err = PTR_ERR(vq->worker);
+				goto err;
+			}
+
+			wake_up_process(vq->worker);
+
+			/* avoid contributing to loadavg */
+			err = vhost_attach_cgroups(vq);
+			if (err)
+				goto err;
+		} else {
+			/* Share work with an existing thread */
+			int j = vhost_get_vq_index(i);
+
+			vq->worker = dev->vqs[j].worker;
+		}
+	}
+	return 0;
+
+err:
+	__vhost_stop_workers(dev, i / 2);
+	return err;
+}
+
 /* Caller should have device mutex */
-static long vhost_dev_set_owner(struct vhost_dev *dev)
+static long vhost_dev_set_owner(struct vhost_dev *dev, int numtxqs)
 {
-	struct task_struct *worker;
 	int err;
 
 	/* Is there an owner already? */
@@ -362,33 +459,30 @@ static long vhost_dev_set_owner(struct v
 		goto err_mm;
 	}
 
+	err = vhost_setup_vqs(dev, numtxqs);
+	if (err)
+		goto err_mm;
+
 	/* No owner, become one */
 	dev->mm = get_task_mm(current);
-	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
-	if (IS_ERR(worker)) {
-		err = PTR_ERR(worker);
-		goto err_worker;
-	}
-
-	dev->worker = worker;
-	wake_up_process(worker);	/* avoid contributing to loadavg */
 
-	err = vhost_attach_cgroups(dev);
+	/* Start threads */
+	err =  vhost_start_workers(dev);
 	if (err)
-		goto err_cgroup;
+		goto free_vqs;
 
 	err = vhost_dev_alloc_iovecs(dev);
 	if (err)
-		goto err_cgroup;
+		goto clean_workers;
 
 	return 0;
-err_cgroup:
-	kthread_stop(worker);
-	dev->worker = NULL;
-err_worker:
+clean_workers:
+	vhost_stop_workers(dev);
+free_vqs:
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
+	vhost_free_vqs(dev);
 err_mm:
 	return err;
 }
@@ -474,14 +568,7 @@ void vhost_dev_cleanup(struct vhost_dev 
 	kfree(rcu_dereference_protected(dev->memory,
 					lockdep_is_held(&dev->mutex)));
 	RCU_INIT_POINTER(dev->memory, NULL);
-	WARN_ON(!list_empty(&dev->work_list));
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-	}
-	if (dev->mm)
-		mmput(dev->mm);
-	dev->mm = NULL;
+	vhost_stop_workers(dev);
 }
 
 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -835,7 +922,7 @@ long vhost_dev_ioctl(struct vhost_dev *d
 
 	/* If you are not the owner, you can become one */
 	if (ioctl == VHOST_SET_OWNER) {
-		r = vhost_dev_set_owner(d);
+		r = vhost_dev_set_owner(d, arg);
 		goto done;
 	}
 
diff -ruNp org/drivers/vhost/vhost.h new/drivers/vhost/vhost.h
--- org/drivers/vhost/vhost.h	2011-11-11 16:44:56.000000000 +0530
+++ new/drivers/vhost/vhost.h	2011-11-11 16:45:11.000000000 +0530
@@ -18,6 +18,9 @@
 #define VHOST_DMA_DONE_LEN	1
 #define VHOST_DMA_CLEAR_LEN	0
 
+/* TX vqs are those vq's whose qnum's are odd */
+#define VHOST_NET_TX_VQS	0x1
+
 struct vhost_device;
 
 struct vhost_work;
@@ -40,11 +43,11 @@ struct vhost_poll {
 	wait_queue_t              wait;
 	struct vhost_work	  work;
 	unsigned long		  mask;
-	struct vhost_dev	 *dev;
+	struct vhost_virtqueue	  *vq;  /* points back to vq */
 };
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev);
+		     unsigned long mask, struct vhost_virtqueue *vq);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -141,8 +144,21 @@ struct vhost_virtqueue {
 	/* Reference counting for outstanding ubufs.
 	 * Protected by vq mutex. Writers must also take device mutex. */
 	struct vhost_ubuf_ref *ubufs;
+
+	struct task_struct *worker; /* worker for this vq */
+	spinlock_t *work_lock;	/* points to a dev->work[].work_lock entry */
+	struct list_head *work_list;	/* points to a dev->work[].work_list entry */
+	int qnum;	/* 0 for RX, 1 for TX, and so on alternately */
 };
 
+/* work entry and the lock */
+struct work_lock_list {
+	spinlock_t work_lock;
+	struct list_head work_list;
+} ____cacheline_aligned_in_smp;
+
+#define MAX_VHOST_THREADS	4
+
 struct vhost_dev {
 	/* Readers use RCU to access memory table pointer
 	 * log base pointer and features.
@@ -155,11 +171,11 @@ struct vhost_dev {
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
-	spinlock_t work_lock;
-	struct list_head work_list;
-	struct task_struct *worker;
+	struct work_lock_list work[MAX_VHOST_THREADS];
 };
 
+int vhost_setup_vqs(struct vhost_dev *dev, int numtxqs);
+void vhost_free_vqs(struct vhost_dev *dev);
 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
 long vhost_dev_check_owner(struct vhost_dev *);
 long vhost_dev_reset_owner(struct vhost_dev *);

^ permalink raw reply

* [RFC] [ver3 PATCH 2/6] virtio: Move 'num_queues' to virtqueue
From: Krishna Kumar @ 2011-11-11 13:03 UTC (permalink / raw)
  To: rusty, mst; +Cc: netdev, virtualization, davem, Krishna Kumar, kvm
In-Reply-To: <20111111130223.9878.59517.sendpatchset@krkumar2.in.ibm.com>

Move queue_index from "virtio_pci_vq_info" to "virtqueue". This is
needed to figure out the queue number of the vq in the 'done'
handler of the device.

Signed-off-by: krkumar2@in.ibm.com
---
 drivers/virtio/virtio_pci.c |   10 +++-------
 include/linux/virtio.h      |    1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff -ruNp org/drivers/virtio/virtio_pci.c new/drivers/virtio/virtio_pci.c
--- org/drivers/virtio/virtio_pci.c	2011-11-11 16:44:30.000000000 +0530
+++ new/drivers/virtio/virtio_pci.c	2011-11-11 16:44:45.000000000 +0530
@@ -75,9 +75,6 @@ struct virtio_pci_vq_info
 	/* the number of entries in the queue */
 	int num;
 
-	/* the index of the queue */
-	int queue_index;
-
 	/* the virtual address of the ring queue */
 	void *queue;
 
@@ -180,11 +177,10 @@ static void vp_reset(struct virtio_devic
 static void vp_notify(struct virtqueue *vq)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-	struct virtio_pci_vq_info *info = vq->priv;
 
 	/* we write the queue's selector into the notification register to
 	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+	iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -380,7 +376,6 @@ static struct virtqueue *setup_vq(struct
 	if (!info)
 		return ERR_PTR(-ENOMEM);
 
-	info->queue_index = index;
 	info->num = num;
 	info->msix_vector = msix_vec;
 
@@ -403,6 +398,7 @@ static struct virtqueue *setup_vq(struct
 		goto out_activate_queue;
 	}
 
+	vq->queue_index = index;
 	vq->priv = info;
 	info->vq = vq;
 
@@ -445,7 +441,7 @@ static void vp_del_vq(struct virtqueue *
 	list_del(&info->node);
 	spin_unlock_irqrestore(&vp_dev->lock, flags);
 
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	iowrite16(vq->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
 	if (vp_dev->msix_enabled) {
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
diff -ruNp org/include/linux/virtio.h new/include/linux/virtio.h
--- org/include/linux/virtio.h	2011-11-11 16:44:30.000000000 +0530
+++ new/include/linux/virtio.h	2011-11-11 16:44:45.000000000 +0530
@@ -22,6 +22,7 @@ struct virtqueue {
 	void (*callback)(struct virtqueue *vq);
 	const char *name;
 	struct virtio_device *vdev;
+	int queue_index;	/* the index of the queue */
 	void *priv;
 };
 

^ permalink raw reply

* [RFC] [ver3 PATCH 1/6] virtio_net: Introduce VIRTIO_NET_F_MULTIQUEUE
From: Krishna Kumar @ 2011-11-11 13:02 UTC (permalink / raw)
  To: rusty, mst; +Cc: netdev, Krishna Kumar, davem, kvm, virtualization
In-Reply-To: <20111111130223.9878.59517.sendpatchset@krkumar2.in.ibm.com>

Introduce VIRTIO_NET_F_MULTIQUEUE. 

Signed-off-by: krkumar2@in.ibm.com
---
 include/linux/virtio_net.h |    1 +
 1 file changed, 1 insertion(+)

diff -ruNp org/include/linux/virtio_net.h new/include/linux/virtio_net.h
--- org/include/linux/virtio_net.h	2011-10-12 10:16:46.000000000 +0530
+++ new/include/linux/virtio_net.h	2011-11-11 16:44:34.000000000 +0530
@@ -49,6 +49,7 @@
 #define VIRTIO_NET_F_CTRL_RX	18	/* Control channel RX mode support */
 #define VIRTIO_NET_F_CTRL_VLAN	19	/* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support */
+#define VIRTIO_NET_F_MULTIQUEUE	21	/* Device supports multiple TXQ/RXQ */
 
 #define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
 


^ permalink raw reply

* [RFC] [ver3 PATCH 0/6] Implement multiqueue virtio-net
From: Krishna Kumar @ 2011-11-11 13:02 UTC (permalink / raw)
  To: rusty, mst; +Cc: netdev, kvm, davem, Krishna Kumar, virtualization

This patch series resurrects the earlier multiple TX/RX queues
functionality for virtio_net, and addresses the issues pointed
out.  It also includes an API to share irq's, f.e.  amongst the
TX vqs. 

I plan to run TCP/UDP STREAM and RR tests for local->host and
local->remote, and send the results in the next couple of days.

patch #1: Introduce VIRTIO_NET_F_MULTIQUEUE
patch #2: Move 'num_queues' to virtqueue
patch #3: virtio_net driver changes
patch #4: vhost_net changes
patch #5: Implement find_vqs_irq()
patch #6: Convert virtio_net driver to use find_vqs_irq()

		Changes from rev2:
Michael:
-------
1. Added functions to handle setting RX/TX/CTRL vq's.
2. num_queue_pairs instead of numtxqs.
3. Experimental support for fewer irq's in find_vqs.

Rusty:
------
4. Cleaned up some existing "while (1)".
5. rvq/svq and rx_sg/tx_sg changed to vq and sg respectively.
6. Cleaned up some "#if 1" code.

Issue when using patch5:
-------------------------

The new API is designed to minimize code duplication.  E.g.
vp_find_vqs() is implemented as:

static int vp_find_vqs(...)
{
	return vp_find_vqs_irq(vdev, nvqs, vqs, callbacks, names, NULL);
}

In my testing, when multiple tx/rx is used with multiple netperf
sessions, all the device tx queues stop a few thousand times and
are subsequently woken up by skb_xmit_done.  But after some 40K-50K
iterations of stop/wake, some of the txq's stop and no wake
interrupt comes. (modprobe -r followed by modprobe solves this, so
it is not a system hang).  At the time of the hang (#txqs=#rxqs=4):

# egrep "CPU|virtio0" /proc/interrupts | grep -v config
       CPU0     CPU1     CPU2    CPU3
41:    49057    49262    48828   49421  PCI-MSI-edge    virtio0-input.0
42:    5066     5213     5221    5109   PCI-MSI-edge    virtio0-output.0
43:    43380    43770    43007   43148  PCI-MSI-edge    virtio0-input.1
44:    41433    41727    42101   41175  PCI-MSI-edge    virtio0-input.2
45:    38465    37629    38468   38768  PCI-MSI-edge    virtio0-input.3

# tc -s qdisc show dev eth0
qdisc mq 0: root      
	Sent 393196939897 bytes 271191624 pkt (dropped 59897,
	overlimits 0 requeues 67156) backlog 25375720b 1601p
	requeues 67156  

I am not sure if patch #5 is responsible for the hang.  Also, without
patch #5/patch #6, I changed vp_find_vqs() to:
static int vp_find_vqs(...)
{
	return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
				  false, false);
}
No packets were getting TX'd with this change when #txqs>1.  This is
with the MQ-only patch that doesn't touch drivers/virtio/ directory.

Also, the MQ patch works reasonably well with 2 vectors - with
use_msix=1 and per_vq_vectors=0 in vp_find_vqs().

Patch against net-next - please review.

Signed-off-by: krkumar2@in.ibm.com
---

^ permalink raw reply

* Re: [PATCH 4/4] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
From: Michael S. Tsirkin @ 2011-11-11 12:38 UTC (permalink / raw)
  To: Ian Campbell
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, David S. Miller, Neil Brown,
	J. Bruce Fields, linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1320850927-30240-4-git-send-email-ian.campbell-Sxgqhf6Nn4DQT0dZR+AlfA@public.gmane.org>

On Wed, Nov 09, 2011 at 03:02:07PM +0000, Ian Campbell wrote:
> This prevents an issue where an ACK is delayed, a retransmit is queued (either
> at the RPC or TCP level) and the ACK arrives before the retransmission hits the
> wire. If this happens to an NFS WRITE RPC then the write() system call
> completes and the userspace process can continue, potentially modifying data
> referenced by the retransmission before the retransmission occurs.
> 
> Signed-off-by: Ian Campbell <ian.campbell-Sxgqhf6Nn4DQT0dZR+AlfA@public.gmane.org>
> Acked-by: Trond Myklebust <Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
> Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> Cc: Neil Brown <neilb-l3A5Bk7waGM@public.gmane.org>
> Cc: "J. Bruce Fields" <bfields-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
> Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

So this blocks the system call until all page references
are gone, right?
But, there's no upper limit on how long the
page is referenced, correct? Consider a bridged setup
with an skb queued at a tap device - couldn't this cause one process
to block another one by virtue of not consuming a cloned skb?

> ---
>  include/linux/sunrpc/xdr.h  |    2 ++
>  include/linux/sunrpc/xprt.h |    5 ++++-
>  net/sunrpc/clnt.c           |   27 ++++++++++++++++++++++-----
>  net/sunrpc/svcsock.c        |    3 ++-
>  net/sunrpc/xprt.c           |   13 +++++++++++++
>  net/sunrpc/xprtsock.c       |    3 ++-
>  6 files changed, 45 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
> index a20970e..172f81e 100644
> --- a/include/linux/sunrpc/xdr.h
> +++ b/include/linux/sunrpc/xdr.h
> @@ -16,6 +16,7 @@
>  #include <asm/byteorder.h>
>  #include <asm/unaligned.h>
>  #include <linux/scatterlist.h>
> +#include <linux/skbuff.h>
>  
>  /*
>   * Buffer adjustment
> @@ -57,6 +58,7 @@ struct xdr_buf {
>  			tail[1];	/* Appended after page data */
>  
>  	struct page **	pages;		/* Array of contiguous pages */
> +	struct skb_frag_destructor *destructor;
>  	unsigned int	page_base,	/* Start of page data */
>  			page_len,	/* Length of page data */
>  			flags;		/* Flags for data disposition */
> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
> index 15518a1..75131eb 100644
> --- a/include/linux/sunrpc/xprt.h
> +++ b/include/linux/sunrpc/xprt.h
> @@ -92,7 +92,10 @@ struct rpc_rqst {
>  						/* A cookie used to track the
>  						   state of the transport
>  						   connection */
> -	
> +	struct skb_frag_destructor destructor;	/* SKB paged fragment
> +						 * destructor for
> +						 * transmitted pages*/
> +
>  	/*
>  	 * Partial send handling
>  	 */
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index c5347d2..919538d 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -61,6 +61,7 @@ static void	call_reserve(struct rpc_task *task);
>  static void	call_reserveresult(struct rpc_task *task);
>  static void	call_allocate(struct rpc_task *task);
>  static void	call_decode(struct rpc_task *task);
> +static void	call_complete(struct rpc_task *task);
>  static void	call_bind(struct rpc_task *task);
>  static void	call_bind_status(struct rpc_task *task);
>  static void	call_transmit(struct rpc_task *task);
> @@ -1113,6 +1114,8 @@ rpc_xdr_encode(struct rpc_task *task)
>  			 (char *)req->rq_buffer + req->rq_callsize,
>  			 req->rq_rcvsize);
>  
> +	req->rq_snd_buf.destructor = &req->destructor;
> +
>  	p = rpc_encode_header(task);
>  	if (p == NULL) {
>  		printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
> @@ -1276,6 +1279,7 @@ call_connect_status(struct rpc_task *task)
>  static void
>  call_transmit(struct rpc_task *task)
>  {
> +	struct rpc_rqst *req = task->tk_rqstp;
>  	dprint_status(task);
>  
>  	task->tk_action = call_status;
> @@ -1309,8 +1313,8 @@ call_transmit(struct rpc_task *task)
>  	call_transmit_status(task);
>  	if (rpc_reply_expected(task))
>  		return;
> -	task->tk_action = rpc_exit_task;
> -	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
> +	task->tk_action = call_complete;
> +	skb_frag_destructor_unref(&req->destructor);
>  }
>  
>  /*
> @@ -1383,7 +1387,8 @@ call_bc_transmit(struct rpc_task *task)
>  		return;
>  	}
>  
> -	task->tk_action = rpc_exit_task;
> +	task->tk_action = call_complete;
> +	skb_frag_destructor_unref(&req->destructor);
>  	if (task->tk_status < 0) {
>  		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
>  			"error: %d\n", task->tk_status);
> @@ -1423,7 +1428,6 @@ call_bc_transmit(struct rpc_task *task)
>  			"error: %d\n", task->tk_status);
>  		break;
>  	}
> -	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
>  }
>  #endif /* CONFIG_SUNRPC_BACKCHANNEL */
>  
> @@ -1589,12 +1593,14 @@ call_decode(struct rpc_task *task)
>  		return;
>  	}
>  
> -	task->tk_action = rpc_exit_task;
> +	task->tk_action = call_complete;
>  
>  	if (decode) {
>  		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
>  						      task->tk_msg.rpc_resp);
>  	}
> +	rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
> +	skb_frag_destructor_unref(&req->destructor);
>  	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
>  			task->tk_status);
>  	return;
> @@ -1609,6 +1615,17 @@ out_retry:
>  	}
>  }
>  
> +/*
> + * 8.	Wait for pages to be released by the network stack.
> + */
> +static void
> +call_complete(struct rpc_task *task)
> +{
> +	dprintk("RPC: %5u call_complete result %d\n",
> +		task->tk_pid, task->tk_status);
> +	task->tk_action = rpc_exit_task;
> +}
> +
>  static __be32 *
>  rpc_encode_header(struct rpc_task *task)
>  {
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 852a258..3685cad 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -196,7 +196,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
>  	while (pglen > 0) {
>  		if (slen == size)
>  			flags = 0;
> -		result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
> +		result = kernel_sendpage(sock, *ppage, xdr->destructor,
> +					 base, size, flags);
>  		if (result > 0)
>  			len += result;
>  		if (result != size)
> diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
> index f4385e4..925aa0c 100644
> --- a/net/sunrpc/xprt.c
> +++ b/net/sunrpc/xprt.c
> @@ -1103,6 +1103,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
>  	xprt->xid = net_random();
>  }
>  
> +static int xprt_complete_skb_pages(void *calldata)
> +{
> +	struct rpc_task *task = calldata;
> +	struct rpc_rqst	*req = task->tk_rqstp;
> +
> +	dprintk("RPC: %5u completing skb pages\n", task->tk_pid);
> +	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
> +	return 0;
> +}
> +
>  static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>  {
>  	struct rpc_rqst	*req = task->tk_rqstp;
> @@ -1115,6 +1125,9 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
>  	req->rq_xid     = xprt_alloc_xid(xprt);
>  	req->rq_release_snd_buf = NULL;
>  	xprt_reset_majortimeo(req);
> +	atomic_set(&req->destructor.ref, 1);
> +	req->destructor.destroy = &xprt_complete_skb_pages;
> +	req->destructor.data = task;
>  	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
>  			req, ntohl(req->rq_xid));
>  }
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index f79e40e9..af3a106 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>  		remainder -= len;
>  		if (remainder != 0 || more)
>  			flags |= MSG_MORE;
> -		err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
> +		err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
> +					  base, len, flags);
>  		if (remainder == 0 || err != len)
>  			break;
>  		sent += err;
> -- 
> 1.7.2.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V2 01/14] clk: add helper functions clk_prepare_enable and clk_disable_unprepare
From: Sascha Hauer @ 2011-11-11 11:56 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Richard Zhao, amit.kucheria, kernel, netdev, linux-mmc, ben-linux,
	eric.miao, linux-i2c, linux-serial, cjb, linux-arm-kernel, alan
In-Reply-To: <20111111102727.GC12913@n2100.arm.linux.org.uk>

On Fri, Nov 11, 2011 at 10:27:27AM +0000, Russell King - ARM Linux wrote:
> On Fri, Nov 11, 2011 at 10:15:56AM +0100, Sascha Hauer wrote:
> > On Fri, Nov 11, 2011 at 05:05:47PM +0800, Richard Zhao wrote:
> > > {
> > > 	int ret;
> > > 
> > > 	ret = clk_prepare(clk);
> > > 	if (ret)
> > > 		return ret;
> > > 	ret = clk_enable(clk);
> > > 	if (ret)
> > > 		clk_unprepare(clk);
> > > 	return ret;
> > 
> > Yes, looks good.
> 
> While this looks like a nice easy solution for converting existing
> drivers, I'd suggest thinking about this a little more...
> 
> I would suggest some thought is given to the placement of clk_enable()
> and clk_disable() when adding clk_prepare(), especially if your existing
> clk_enable() function can only be called from non-atomic contexts.
> 
> Obviously, the transition path needs to be along these lines:
> 
> 1. add clk_prepare() to drivers
> 2. implement clk_prepare() and make clk_enable() callable from non-atomic
>    contexts
> 3. move clk_enable() in drivers to places it can be called from non-atomic
>    contexts to achieve greater power savings (maybe via the runtime pm)
> 
> and where a driver is shared between different sub-architectures which
> have non-atomic clk_enable()s, (3) can only happen when all those sub-
> architectures have been updated to step (2).

The drivers changed here all do clk_prepare/enable in their probe
function. I agree that this clk_prepare_enable patch gives kind of
wrong motivation to just use this function and to forget about
potential power savings with proper integration of clk_prepare/enable.
I think though that it will take a long time until all drivers really
do this no matter if we have such a helper or not. I think that in the
meantime it's better to have a little helper than to clobber the probe
code with additional error handling.

Sascha

-- 
Pengutronix e.K.                           |                             |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |
Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-5555 |

^ permalink raw reply

* Re: creating netdev queues on the fly?
From: Eric Dumazet @ 2011-11-11 11:54 UTC (permalink / raw)
  To: Dave Taht
  Cc: Denys Fedoryshchenko, Helmut Schaa, Johannes Berg, netdev,
	linux-wireless, Andrew McGregor
In-Reply-To: <CAA93jw7n1jYiWrnHOF0Zmzd0cVtadNhPSCpP5YqEdq_Q9opw5A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Le vendredi 11 novembre 2011 à 12:42 +0100, Dave Taht a écrit :

> More elegant to be sure. But for N you kind of need to do ewma between
> aggregated transmission groups.
> 

I speak at Class/Qdisc level. If each transmission group has its own
class [ I believe it should ], it should fit.


> I still don't 'get' how we can split out a stream based on a
> stationid, toss it in a queue to be further scheduled (my choice would
> be QFQ, btw), and then sanely de-schedule a burst of packets for that
> destination appropriate for that station's aggregation level and
> transmit rate using existing tc methods.
> 

Right now it's not possible since we don't have feedback once a packet
is dequeued from the qdisc. But it should be doable.

> I liked the callback idea discussed earlier for implementing a
> 'grouper' of this sort.
> 
> That said I'm strongly encouraged by the dialog thus far on this thread.
> 

...

> and I assume that you are either making this syntax up or coding
> faster than emailing in some tree somewhere...
> 

That's because I prefer to discuss this first and start coding only once
the general idea is accepted.

Note this delay idea is not new, it already was mentioned on netdev some
months ago.



--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: creating netdev queues on the fly?
From: Dave Taht @ 2011-11-11 11:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Denys Fedoryshchenko, Helmut Schaa, Johannes Berg, netdev,
	linux-wireless, Andrew McGregor
In-Reply-To: <1321009374.2548.31.camel@edumazet-laptop>

On Fri, Nov 11, 2011 at 12:02 PM, Eric Dumazet <eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> Le jeudi 10 novembre 2011 à 16:25 +0100, Dave Taht a écrit :
>
>
>
>> Two notes:
>>
>> 1) Getting 'time' from the kernel is expensive. And: System time wanders.
>>
>> mac80211 Wifi devices however do export a get_tsf function which could
>> be used as a relative-to-the-queue clock - and we actually don't need
>> accuracy down to the level of get_tsf (25ns)
>>
>
>
> getting 'time' from kernel is not that expensive, depending on
> resolution you need. We are not going to use timestamps from devices !

Wireless keeps track of time via a TSF function.

http://en.wikipedia.org/wiki/Timing_Synchronization_Function_%28TSF%29

TSF wanders forward based on an election process from broadcast beacons,
the presentation below contains animations and the like that describe
how it works

http://www.cse.ohio-state.edu/~lai/788-Au05/2-scalibility.ppt

Further it copes with life in terms of TU. One tu is 1024us.

Now, whether any of that is truly relevant to how to do packet
scheduling well on wireless, is a good question. TU is fairly
fundamental to the minstrel algorithm. It bothers me to have anything
resembling a beat frequency.

>
> If ms resolution is enough (say you want to drop packets if they stay
> more than 100ms in qdisc), jiffies is a single memory read.

The intended properties of the hardware queues are 10 ms VO, 100 ms
VI, and some_sane_number_relative_to_real_bandwidth for BE and BK.

The software queues should make similar assumptions, although for
voice traffic, being able to drop and 'pull forward' the next packet
in a stream should it exceed time in queue would be good, and you
could go with 30 ms in queue with that level of jitter.

>
> If needing us or ns resolution, psched_get_time() uses ktime_get(), and
> is used in CBQ, HTB, HFSC, TBF, so if it was expensive we would have big
> problem right now :)
>
>> 2) You need to get a timestamp on entry to the first queue and check
>> against the allowable latency on exit from the last. So to construct a
>> tc chain you'd want a tfifo (timestamp on entry), fifot (check
>> timestamp against limit on dequeue), and for the simplest of
>> applications : tfifot (timestamp on entry, check on exit)
>
> That would be not very practical.
>
> I would see a new Qdisc/Class property, like the rate estimator, that we
> can attach to any Qdisc/Class with a new tc option.
>
> Even without any limit enforcing (might be Random Early Detection by the
> way), it could be used to get a Queue Delay estimation, using EWMA
>
> avqdelay = avqdelay*(1-W) + qdelay*W;
> W = 2^(-ewma_log);
>
> tc [ qdisc | class] add [...] [est 1sec 8sec] [delayest ewma_log ] ..

More elegant to be sure. But for N you kind of need to do ewma between
aggregated transmission groups.

>
> tc -s -d qdisc ...
> qdisc htb 1: root refcnt 2 r2q 10 default 1 direct_packets_stat 0 ver 3.17
>  Sent 3596219 bytes 2567 pkt (dropped 238, overlimits 3797 requeues 0)
>  rate 2557Kbit 215pps backlog 0b 0p requeues 0
>  delay 91ms

I still don't 'get' how we can split out a stream based on a
stationid, toss it in a queue to be further scheduled (my choice would
be QFQ, btw), and then sanely de-schedule a burst of packets for that
destination appropriate for that station's aggregation level and
transmit rate using existing tc methods.

I liked the callback idea discussed earlier for implementing a
'grouper' of this sort.

That said I'm strongly encouraged by the dialog thus far on this thread.

>
>
>
> tc [ qdisc | class] add [...] [est 1sec 8sec] [delaylimit max ] ..
>
> tc -s -d qdisc ...
> qdisc htb 1: root refcnt 2 r2q 10 default 1 direct_packets_stat 0 ver 3.17
>  Sent 3596219 bytes 2567 pkt (dropped 238, overlimits 3797 requeues 0)
>  rate 2557Kbit 215pps backlog 0b 0p requeues 0
>  delay 91ms delaylimit 100ms (dropped 12)

and I assume that you are either making this syntax up or coding
faster than emailing in some tree somewhere...

>
>
>
>

-- 
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: creating netdev queues on the fly?
From: Eric Dumazet @ 2011-11-11 11:02 UTC (permalink / raw)
  To: Dave Taht
  Cc: Denys Fedoryshchenko, Helmut Schaa, Johannes Berg, netdev,
	linux-wireless
In-Reply-To: <CAA93jw7ECQegWj6rpd48sbmDQUjorCYzXANXJSj0baHtxzC7EA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Le jeudi 10 novembre 2011 à 16:25 +0100, Dave Taht a écrit :



> Two notes:
> 
> 1) Getting 'time' from the kernel is expensive. And: System time wanders.
> 
> mac80211 Wifi devices however do export a get_tsf function which could
> be used as a relative-to-the-queue clock - and we actually don't need
> accuracy down to the level of get_tsf (25ns)
> 


getting 'time' from kernel is not that expensive, depending on
resolution you need. We are not going to use timestamps from devices !

If ms resolution is enough (say you want to drop packets if they stay
more than 100ms in qdisc), jiffies is a single memory read.

If needing us or ns resolution, psched_get_time() uses ktime_get(), and
is used in CBQ, HTB, HFSC, TBF, so if it was expensive we would have big
problem right now :)

> 2) You need to get a timestamp on entry to the first queue and check
> against the allowable latency on exit from the last. So to construct a
> tc chain you'd want a tfifo (timestamp on entry), fifot (check
> timestamp against limit on dequeue), and for the simplest of
> applications : tfifot (timestamp on entry, check on exit)

That would be not very practical.

I would see a new Qdisc/Class property, like the rate estimator, that we
can attach to any Qdisc/Class with a new tc option.

Even without any limit enforcing (might be Random Early Detection by the
way), it could be used to get a Queue Delay estimation, using EWMA

avqdelay = avqdelay*(1-W) + qdelay*W;
W = 2^(-ewma_log);

tc [ qdisc | class] add [...] [est 1sec 8sec] [delayest ewma_log ] ..

tc -s -d qdisc ...
qdisc htb 1: root refcnt 2 r2q 10 default 1 direct_packets_stat 0 ver 3.17
 Sent 3596219 bytes 2567 pkt (dropped 238, overlimits 3797 requeues 0) 
 rate 2557Kbit 215pps backlog 0b 0p requeues 0 
 delay 91ms



tc [ qdisc | class] add [...] [est 1sec 8sec] [delaylimit max ] ..

tc -s -d qdisc ...
qdisc htb 1: root refcnt 2 r2q 10 default 1 direct_packets_stat 0 ver 3.17
 Sent 3596219 bytes 2567 pkt (dropped 238, overlimits 3797 requeues 0) 
 rate 2557Kbit 215pps backlog 0b 0p requeues 0 
 delay 91ms delaylimit 100ms (dropped 12)



--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH V3 10/14] ARM: mxs: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-i2c, linux-mmc, netdev, linux-serial
  Cc: linux, amit.kucheria, kernel, ben-linux, cjb, alan, eric.miao,
	Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao@linaro.org>

Signed-off-by: Richard Zhao <richard.zhao@linaro.org>
---
 arch/arm/mach-mxs/system.c |    2 +-
 arch/arm/mach-mxs/timer.c  |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-mxs/system.c b/arch/arm/mach-mxs/system.c
index 20ec3bd..9760a12 100644
--- a/arch/arm/mach-mxs/system.c
+++ b/arch/arm/mach-mxs/system.c
@@ -66,7 +66,7 @@ static int __init mxs_arch_reset_init(void)
 
 	clk = clk_get_sys("rtc", NULL);
 	if (!IS_ERR(clk))
-		clk_enable(clk);
+		clk_prepare_enable(clk);
 
 	return 0;
 }
diff --git a/arch/arm/mach-mxs/timer.c b/arch/arm/mach-mxs/timer.c
index cace0d2..564a632 100644
--- a/arch/arm/mach-mxs/timer.c
+++ b/arch/arm/mach-mxs/timer.c
@@ -245,7 +245,7 @@ static int __init mxs_clocksource_init(struct clk *timer_clk)
 
 void __init mxs_timer_init(struct clk *timer_clk, int irq)
 {
-	clk_enable(timer_clk);
+	clk_prepare_enable(timer_clk);
 
 	/*
 	 * Initialize timers to a known state
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH V3 07/14] ARM: mxc: audmux-v2: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-i2c, linux-mmc, netdev, linux-serial
  Cc: linux, amit.kucheria, kernel, ben-linux, cjb, alan, eric.miao,
	Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao@linaro.org>

Signed-off-by: Richard Zhao <richard.zhao@linaro.org>
---
 arch/arm/plat-mxc/audmux-v2.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm/plat-mxc/audmux-v2.c b/arch/arm/plat-mxc/audmux-v2.c
index 8cced35..0e51fc3 100644
--- a/arch/arm/plat-mxc/audmux-v2.c
+++ b/arch/arm/plat-mxc/audmux-v2.c
@@ -73,13 +73,13 @@ static ssize_t audmux_read_file(struct file *file, char __user *user_buf,
 		return -ENOMEM;
 
 	if (audmux_clk)
-		clk_enable(audmux_clk);
+		clk_prepare_enable(audmux_clk);
 
 	ptcr = readl(audmux_base + MXC_AUDMUX_V2_PTCR(port));
 	pdcr = readl(audmux_base + MXC_AUDMUX_V2_PDCR(port));
 
 	if (audmux_clk)
-		clk_disable(audmux_clk);
+		clk_disable_unprepare(audmux_clk);
 
 	ret = snprintf(buf, PAGE_SIZE, "PDCR: %08x\nPTCR: %08x\n",
 		       pdcr, ptcr);
@@ -172,13 +172,13 @@ int mxc_audmux_v2_configure_port(unsigned int port, unsigned int ptcr,
 		return -ENOSYS;
 
 	if (audmux_clk)
-		clk_enable(audmux_clk);
+		clk_prepare_enable(audmux_clk);
 
 	writel(ptcr, audmux_base + MXC_AUDMUX_V2_PTCR(port));
 	writel(pdcr, audmux_base + MXC_AUDMUX_V2_PDCR(port));
 
 	if (audmux_clk)
-		clk_disable(audmux_clk);
+		clk_disable_unprepare(audmux_clk);
 
 	return 0;
 }
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH V3 00/14] add clk_prepare/clk_unprepare to imx drivers
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-i2c, linux-mmc, netdev, linux-serial
  Cc: linux, amit.kucheria, kernel, ben-linux, cjb, alan, eric.miao

Changes since V2:
 - fix clk_prepare_enable bug

Changes since V1:
 - Add common helper functions clk_prepare_enable/clk_disable_unprepare
 - serial/imx: move clk_disable_unprepare before clk_put

Richard Zhao (14):
      clk: add helper functions clk_prepare_enable and clk_disable_unprepare
      ARM: mxc: time: add clk_prepare/clk_unprepare
      ARM: mxc: ahci: add clk_prepare/clk_unprepare
      ARM: mxc: pwm: add clk_prepare/clk_unprepare
      ARM: mxc: epit: add clk_prepare/clk_unprepare
      ARM: mxc: arch_reset: add clk_prepare/clk_unprepare
      ARM: mxc: audmux-v2: add clk_prepare/clk_unprepare
      ARM: pm-imx5: add clk_prepare/clk_unprepare
      ARM: mx31moboard: add clk_prepare/clk_unprepare
      ARM: mxs: add clk_prepare/clk_unprepare
      serial: imx: add clk_prepare/clk_unprepare
      net: fec: add clk_prepare/clk_unprepare
      i2c: imx: add clk_prepare/clk_unprepare
      mmc: sdhci-esdhc-imx: add clk_prepare/clk_unprepare

 arch/arm/mach-imx/mach-mx31moboard.c          |    2 +-
 arch/arm/mach-mx5/pm-imx5.c                   |    4 ++--
 arch/arm/mach-mxs/system.c                    |    2 +-
 arch/arm/mach-mxs/timer.c                     |    2 +-
 arch/arm/plat-mxc/audmux-v2.c                 |    8 ++++----
 arch/arm/plat-mxc/devices/platform-ahci-imx.c |   16 ++++++++--------
 arch/arm/plat-mxc/epit.c                      |    2 +-
 arch/arm/plat-mxc/pwm.c                       |    4 ++--
 arch/arm/plat-mxc/system.c                    |    2 +-
 arch/arm/plat-mxc/time.c                      |    2 +-
 drivers/i2c/busses/i2c-imx.c                  |    4 ++--
 drivers/mmc/host/sdhci-esdhc-imx.c            |    6 +++---
 drivers/net/ethernet/freescale/fec.c          |   10 +++++-----
 drivers/tty/serial/imx.c                      |    7 +++----
 include/linux/clk.h                           |   20 ++++++++++++++++++++
 15 files changed, 55 insertions(+), 36 deletions(-)

Thanks
Richard

^ permalink raw reply

* [PATCH V3 14/14] mmc: sdhci-esdhc-imx: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-i2c, linux-mmc, netdev, linux-serial
  Cc: linux, amit.kucheria, kernel, ben-linux, cjb, alan, eric.miao,
	Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao@linaro.org>

Signed-off-by: Richard Zhao <richard.zhao@linaro.org>
---
 drivers/mmc/host/sdhci-esdhc-imx.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index ae57769..f1ae37e 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -462,7 +462,7 @@ static int __devinit sdhci_esdhc_imx_probe(struct platform_device *pdev)
 		err = PTR_ERR(clk);
 		goto err_clk_get;
 	}
-	clk_enable(clk);
+	clk_prepare_enable(clk);
 	pltfm_host->clk = clk;
 
 	if (!is_imx25_esdhc(imx_data))
@@ -550,7 +550,7 @@ no_card_detect_irq:
 		gpio_free(boarddata->wp_gpio);
 no_card_detect_pin:
 no_board_data:
-	clk_disable(pltfm_host->clk);
+	clk_disable_unprepare(pltfm_host->clk);
 	clk_put(pltfm_host->clk);
 err_clk_get:
 	kfree(imx_data);
@@ -577,7 +577,7 @@ static int __devexit sdhci_esdhc_imx_remove(struct platform_device *pdev)
 		gpio_free(boarddata->cd_gpio);
 	}
 
-	clk_disable(pltfm_host->clk);
+	clk_disable_unprepare(pltfm_host->clk);
 	clk_put(pltfm_host->clk);
 	kfree(imx_data);
 
-- 
1.7.5.4



^ permalink raw reply related

* [PATCH V3 13/14] i2c: imx: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux-mmc-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-lFZ/pmaqli7XmaaqVzeoHQ,
	amit.kucheria-Z7WLFzj8eWMS+FvcfC7Uqw,
	kernel-bIcnvbaLZ9MEGnE8C9+IrQ, ben-linux-elnMNo+KYs3YtjvyW6yDsg,
	cjb-2X9k7bc8m7Mdnm+yROfE0A, alan-VuQAYsv1563Yd54FQh9/CA,
	eric.miao-QSEj5FYQhm4dnm+yROfE0A, Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>

Signed-off-by: Richard Zhao <richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
 drivers/i2c/busses/i2c-imx.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 58832e5..8d1ab6f 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -196,7 +196,7 @@ static int i2c_imx_start(struct imx_i2c_struct *i2c_imx)
 
 	dev_dbg(&i2c_imx->adapter.dev, "<%s>\n", __func__);
 
-	clk_enable(i2c_imx->clk);
+	clk_prepare_enable(i2c_imx->clk);
 	writeb(i2c_imx->ifdr, i2c_imx->base + IMX_I2C_IFDR);
 	/* Enable I2C controller */
 	writeb(0, i2c_imx->base + IMX_I2C_I2SR);
@@ -245,7 +245,7 @@ static void i2c_imx_stop(struct imx_i2c_struct *i2c_imx)
 
 	/* Disable I2C controller */
 	writeb(0, i2c_imx->base + IMX_I2C_I2CR);
-	clk_disable(i2c_imx->clk);
+	clk_disable_unprepare(i2c_imx->clk);
 }
 
 static void __init i2c_imx_set_clk(struct imx_i2c_struct *i2c_imx,
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH V3 12/14] net: fec: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-i2c, linux-mmc, netdev, linux-serial
  Cc: linux, amit.kucheria, kernel, ben-linux, cjb, alan, eric.miao,
	Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao@linaro.org>

Signed-off-by: Richard Zhao <richard.zhao@linaro.org>
---
 drivers/net/ethernet/freescale/fec.c |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c
index 1124ce0..e96fa44 100644
--- a/drivers/net/ethernet/freescale/fec.c
+++ b/drivers/net/ethernet/freescale/fec.c
@@ -1588,7 +1588,7 @@ fec_probe(struct platform_device *pdev)
 		ret = PTR_ERR(fep->clk);
 		goto failed_clk;
 	}
-	clk_enable(fep->clk);
+	clk_prepare_enable(fep->clk);
 
 	ret = fec_enet_init(ndev);
 	if (ret)
@@ -1611,7 +1611,7 @@ failed_register:
 	fec_enet_mii_remove(fep);
 failed_mii_init:
 failed_init:
-	clk_disable(fep->clk);
+	clk_disable_unprepare(fep->clk);
 	clk_put(fep->clk);
 failed_clk:
 	for (i = 0; i < FEC_IRQ_NUM; i++) {
@@ -1638,7 +1638,7 @@ fec_drv_remove(struct platform_device *pdev)
 
 	fec_stop(ndev);
 	fec_enet_mii_remove(fep);
-	clk_disable(fep->clk);
+	clk_disable_unprepare(fep->clk);
 	clk_put(fep->clk);
 	iounmap(fep->hwp);
 	unregister_netdev(ndev);
@@ -1664,7 +1664,7 @@ fec_suspend(struct device *dev)
 		fec_stop(ndev);
 		netif_device_detach(ndev);
 	}
-	clk_disable(fep->clk);
+	clk_disable_unprepare(fep->clk);
 
 	return 0;
 }
@@ -1675,7 +1675,7 @@ fec_resume(struct device *dev)
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct fec_enet_private *fep = netdev_priv(ndev);
 
-	clk_enable(fep->clk);
+	clk_prepare_enable(fep->clk);
 	if (netif_running(ndev)) {
 		fec_restart(ndev, fep->full_duplex);
 		netif_device_attach(ndev);
-- 
1.7.5.4



^ permalink raw reply related

* [PATCH V3 11/14] serial: imx: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux-mmc-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-lFZ/pmaqli7XmaaqVzeoHQ,
	amit.kucheria-Z7WLFzj8eWMS+FvcfC7Uqw,
	kernel-bIcnvbaLZ9MEGnE8C9+IrQ, ben-linux-elnMNo+KYs3YtjvyW6yDsg,
	cjb-2X9k7bc8m7Mdnm+yROfE0A, alan-VuQAYsv1563Yd54FQh9/CA,
	eric.miao-QSEj5FYQhm4dnm+yROfE0A, Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>

Signed-off-by: Richard Zhao <richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
 drivers/tty/serial/imx.c |    7 +++----
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 163fc90..8f3709f 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -1390,7 +1390,7 @@ static int serial_imx_probe(struct platform_device *pdev)
 		ret = PTR_ERR(sport->clk);
 		goto unmap;
 	}
-	clk_enable(sport->clk);
+	clk_prepare_enable(sport->clk);
 
 	sport->port.uartclk = clk_get_rate(sport->clk);
 
@@ -1413,8 +1413,8 @@ deinit:
 	if (pdata && pdata->exit)
 		pdata->exit(pdev);
 clkput:
+	clk_disable_unprepare(sport->clk);
 	clk_put(sport->clk);
-	clk_disable(sport->clk);
 unmap:
 	iounmap(sport->port.membase);
 free:
@@ -1434,11 +1434,10 @@ static int serial_imx_remove(struct platform_device *pdev)
 
 	if (sport) {
 		uart_remove_one_port(&imx_reg, &sport->port);
+		clk_disable_unprepare(sport->clk);
 		clk_put(sport->clk);
 	}
 
-	clk_disable(sport->clk);
-
 	if (pdata && pdata->exit)
 		pdata->exit(pdev);
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH V3 09/14] ARM: mx31moboard: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux-mmc-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-lFZ/pmaqli7XmaaqVzeoHQ,
	amit.kucheria-Z7WLFzj8eWMS+FvcfC7Uqw,
	kernel-bIcnvbaLZ9MEGnE8C9+IrQ, ben-linux-elnMNo+KYs3YtjvyW6yDsg,
	cjb-2X9k7bc8m7Mdnm+yROfE0A, alan-VuQAYsv1563Yd54FQh9/CA,
	eric.miao-QSEj5FYQhm4dnm+yROfE0A, Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>

Signed-off-by: Richard Zhao <richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
 arch/arm/mach-imx/mach-mx31moboard.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c
index 07034f4..d1a9337 100644
--- a/arch/arm/mach-imx/mach-mx31moboard.c
+++ b/arch/arm/mach-imx/mach-mx31moboard.c
@@ -505,7 +505,7 @@ static void mx31moboard_poweroff(void)
 	struct clk *clk = clk_get_sys("imx2-wdt.0", NULL);
 
 	if (!IS_ERR(clk))
-		clk_enable(clk);
+		clk_prepare_enable(clk);
 
 	mxc_iomux_mode(MX31_PIN_WATCHDOG_RST__WATCHDOG_RST);
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH V3 08/14] ARM: pm-imx5: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-i2c, linux-mmc, netdev, linux-serial
  Cc: linux, amit.kucheria, kernel, ben-linux, cjb, alan, eric.miao,
	Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao@linaro.org>

Signed-off-by: Richard Zhao <richard.zhao@linaro.org>
---
 arch/arm/mach-mx5/pm-imx5.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-mx5/pm-imx5.c b/arch/arm/mach-mx5/pm-imx5.c
index 98052fc..2bdc85f 100644
--- a/arch/arm/mach-mx5/pm-imx5.c
+++ b/arch/arm/mach-mx5/pm-imx5.c
@@ -22,7 +22,7 @@ static struct clk *gpc_dvfs_clk;
 
 static int mx5_suspend_prepare(void)
 {
-	return clk_enable(gpc_dvfs_clk);
+	return clk_prepare_enable(gpc_dvfs_clk);
 }
 
 static int mx5_suspend_enter(suspend_state_t state)
@@ -52,7 +52,7 @@ static int mx5_suspend_enter(suspend_state_t state)
 
 static void mx5_suspend_finish(void)
 {
-	clk_disable(gpc_dvfs_clk);
+	clk_disable_unprepare(gpc_dvfs_clk);
 }
 
 static int mx5_pm_valid(suspend_state_t state)
-- 
1.7.5.4



^ permalink raw reply related

* [PATCH V3 06/14] ARM: mxc: arch_reset: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-i2c, linux-mmc, netdev, linux-serial
  Cc: linux, amit.kucheria, kernel, ben-linux, cjb, alan, eric.miao,
	Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao@linaro.org>

Signed-off-by: Richard Zhao <richard.zhao@linaro.org>
---
 arch/arm/plat-mxc/system.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/arm/plat-mxc/system.c b/arch/arm/plat-mxc/system.c
index 9dad8dc..a667adc 100644
--- a/arch/arm/plat-mxc/system.c
+++ b/arch/arm/plat-mxc/system.c
@@ -54,7 +54,7 @@ void arch_reset(char mode, const char *cmd)
 
 		clk = clk_get_sys("imx2-wdt.0", NULL);
 		if (!IS_ERR(clk))
-			clk_enable(clk);
+			clk_prepare_enable(clk);
 		wcr_enable = (1 << 2);
 	}
 
-- 
1.7.5.4



^ permalink raw reply related

* [PATCH V3 05/14] ARM: mxc: epit: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux-mmc-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-lFZ/pmaqli7XmaaqVzeoHQ,
	amit.kucheria-Z7WLFzj8eWMS+FvcfC7Uqw,
	kernel-bIcnvbaLZ9MEGnE8C9+IrQ, ben-linux-elnMNo+KYs3YtjvyW6yDsg,
	cjb-2X9k7bc8m7Mdnm+yROfE0A, alan-VuQAYsv1563Yd54FQh9/CA,
	eric.miao-QSEj5FYQhm4dnm+yROfE0A, Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>

Signed-off-by: Richard Zhao <richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
 arch/arm/plat-mxc/epit.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/arm/plat-mxc/epit.c b/arch/arm/plat-mxc/epit.c
index d3467f8..9129c9e 100644
--- a/arch/arm/plat-mxc/epit.c
+++ b/arch/arm/plat-mxc/epit.c
@@ -203,7 +203,7 @@ static int __init epit_clockevent_init(struct clk *timer_clk)
 
 void __init epit_timer_init(struct clk *timer_clk, void __iomem *base, int irq)
 {
-	clk_enable(timer_clk);
+	clk_prepare_enable(timer_clk);
 
 	timer_base = base;
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH V3 04/14] ARM: mxc: pwm: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux-mmc-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-lFZ/pmaqli7XmaaqVzeoHQ,
	amit.kucheria-Z7WLFzj8eWMS+FvcfC7Uqw,
	kernel-bIcnvbaLZ9MEGnE8C9+IrQ, ben-linux-elnMNo+KYs3YtjvyW6yDsg,
	cjb-2X9k7bc8m7Mdnm+yROfE0A, alan-VuQAYsv1563Yd54FQh9/CA,
	eric.miao-QSEj5FYQhm4dnm+yROfE0A, Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>

Signed-off-by: Richard Zhao <richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
 arch/arm/plat-mxc/pwm.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/plat-mxc/pwm.c b/arch/arm/plat-mxc/pwm.c
index 42d74ea..68c7199b 100644
--- a/arch/arm/plat-mxc/pwm.c
+++ b/arch/arm/plat-mxc/pwm.c
@@ -118,7 +118,7 @@ int pwm_enable(struct pwm_device *pwm)
 	int rc = 0;
 
 	if (!pwm->clk_enabled) {
-		rc = clk_enable(pwm->clk);
+		rc = clk_prepare_enable(pwm->clk);
 		if (!rc)
 			pwm->clk_enabled = 1;
 	}
@@ -131,7 +131,7 @@ void pwm_disable(struct pwm_device *pwm)
 	writel(0, pwm->mmio_base + MX3_PWMCR);
 
 	if (pwm->clk_enabled) {
-		clk_disable(pwm->clk);
+		clk_disable_unprepare(pwm->clk);
 		pwm->clk_enabled = 0;
 	}
 }
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH V3 03/14] ARM: mxc: ahci: add clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-11 10:50 UTC (permalink / raw)
  To: linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux-mmc-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-serial-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-lFZ/pmaqli7XmaaqVzeoHQ,
	amit.kucheria-Z7WLFzj8eWMS+FvcfC7Uqw,
	kernel-bIcnvbaLZ9MEGnE8C9+IrQ, ben-linux-elnMNo+KYs3YtjvyW6yDsg,
	cjb-2X9k7bc8m7Mdnm+yROfE0A, alan-VuQAYsv1563Yd54FQh9/CA,
	eric.miao-QSEj5FYQhm4dnm+yROfE0A, Richard Zhao
In-Reply-To: <1321008637-19999-1-git-send-email-richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>

Signed-off-by: Richard Zhao <richard.zhao-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
 arch/arm/plat-mxc/devices/platform-ahci-imx.c |   16 ++++++++--------
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm/plat-mxc/devices/platform-ahci-imx.c b/arch/arm/plat-mxc/devices/platform-ahci-imx.c
index d8a56ae..ade4a1c 100644
--- a/arch/arm/plat-mxc/devices/platform-ahci-imx.c
+++ b/arch/arm/plat-mxc/devices/platform-ahci-imx.c
@@ -60,9 +60,9 @@ static int imx_sata_init(struct device *dev, void __iomem *addr)
 		dev_err(dev, "no sata clock.\n");
 		return PTR_ERR(sata_clk);
 	}
-	ret = clk_enable(sata_clk);
+	ret = clk_prepare_enable(sata_clk);
 	if (ret) {
-		dev_err(dev, "can't enable sata clock.\n");
+		dev_err(dev, "can't prepare/enable sata clock.\n");
 		goto put_sata_clk;
 	}
 
@@ -73,9 +73,9 @@ static int imx_sata_init(struct device *dev, void __iomem *addr)
 		ret = PTR_ERR(sata_ref_clk);
 		goto release_sata_clk;
 	}
-	ret = clk_enable(sata_ref_clk);
+	ret = clk_prepare_enable(sata_ref_clk);
 	if (ret) {
-		dev_err(dev, "can't enable sata ref clock.\n");
+		dev_err(dev, "can't prepare/enable sata ref clock.\n");
 		goto put_sata_ref_clk;
 	}
 
@@ -104,11 +104,11 @@ static int imx_sata_init(struct device *dev, void __iomem *addr)
 	return 0;
 
 release_sata_ref_clk:
-	clk_disable(sata_ref_clk);
+	clk_disable_unprepare(sata_ref_clk);
 put_sata_ref_clk:
 	clk_put(sata_ref_clk);
 release_sata_clk:
-	clk_disable(sata_clk);
+	clk_disable_unprepare(sata_clk);
 put_sata_clk:
 	clk_put(sata_clk);
 
@@ -117,10 +117,10 @@ put_sata_clk:
 
 static void imx_sata_exit(struct device *dev)
 {
-	clk_disable(sata_ref_clk);
+	clk_disable_unprepare(sata_ref_clk);
 	clk_put(sata_ref_clk);
 
-	clk_disable(sata_clk);
+	clk_disable_unprepare(sata_clk);
 	clk_put(sata_clk);
 
 }
-- 
1.7.5.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox