* [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-21 21:26 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
@ 2007-06-21 21:26 ` PJ Waskiewicz
2007-06-21 23:47 ` Patrick McHardy
0 siblings, 1 reply; 40+ messages in thread
From: PJ Waskiewicz @ 2007-06-21 21:26 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok, kaber, hadi
Add the new sch_rr qdisc for multiqueue network device support.
Allow sch_prio to be compiled with or without multiqueue hardware
support.
sch_rr is part of sch_prio, and is referenced from MODULE_ALIAS. This
was done since sch_prio and sch_rr only differ in their dequeue routine.
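As a sketch of the intended usage (the iproute2 patches adding rr support
are sent separately; eth0 and the 4-queue count are only examples):

# tc qdisc add dev eth0 root handle 1: rr bands 4

Requesting rr loads sch_prio through the MODULE_ALIAS and selects the
round-robin dequeue routine.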
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---
net/sched/Kconfig | 32 ++++++++++++
net/sched/sch_generic.c | 3 +
net/sched/sch_prio.c | 123 ++++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 150 insertions(+), 8 deletions(-)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df84..ca0b352 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -102,8 +102,16 @@ config NET_SCH_ATM
To compile this code as a module, choose M here: the
module will be called sch_atm.
+config NET_SCH_BANDS
+ bool "Multi Band Queueing (PRIO and RR)"
+ ---help---
+ Say Y here if you want to use n-band multiqueue packet
+ schedulers. These include a priority-based scheduler and
+ a round-robin scheduler.
+
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
+ depends on NET_SCH_BANDS
---help---
Say Y here if you want to use an n-band priority queue packet
scheduler.
@@ -111,6 +119,30 @@ config NET_SCH_PRIO
To compile this code as a module, choose M here: the
module will be called sch_prio.
+config NET_SCH_PRIO_MQ
+ bool "Multiple hardware queue support for PRIO"
+ depends on NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to allow the PRIO qdisc to assign
+ flows to multiple hardware queues on an ethernet device. This
+ will still work on devices with 1 queue.
+
+ Consider this scheduler for devices that do not use
+ hardware-based scheduling policies. Otherwise, use NET_SCH_RR.
+
+ Most people will say N here.
+
+config NET_SCH_RR
+ bool "Multi Band Round Robin Queuing (RR)"
+ depends on NET_SCH_BANDS && NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to use an n-band round robin packet
+ scheduler.
+
+ The module uses sch_prio for its framework and is aliased as
+ sch_rr, so it will load sch_prio, although it is referred
+ to using sch_rr.
+
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 9461e8a..203d5c4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -168,7 +168,8 @@ static inline int qdisc_restart(struct net_device *dev)
spin_unlock(&dev->queue_lock);
ret = NETDEV_TX_BUSY;
- if (!netif_queue_stopped(dev))
+ if (!netif_queue_stopped(dev) &&
+ !netif_subqueue_stopped(dev, skb->queue_mapping))
/* churn baby churn .. */
ret = dev_hard_start_xmit(skb, dev);
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 6d7542c..4eb3ba5 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -9,6 +9,8 @@
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
* Init -- EINVAL when opt undefined
+ * Additions: Peter P. Waskiewicz Jr. <peter.p.waskiewicz.jr@intel.com>
+ * Added round-robin scheduling for selection at load-time
*/
#include <linux/module.h>
@@ -40,9 +42,13 @@
struct prio_sched_data
{
int bands;
+#ifdef CONFIG_NET_SCH_RR
+ int curband; /* for round-robin */
+#endif
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+ u16 band2queue[TC_PRIO_MAX + 1];
};
@@ -70,14 +76,19 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
+ skb->queue_mapping =
+ q->band2queue[q->prio2band[band&TC_PRIO_MAX]];
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
- if (band >= q->bands)
+ if (band >= q->bands) {
+ skb->queue_mapping = q->band2queue[q->prio2band[0]];
return q->queues[q->prio2band[0]];
+ }
+ skb->queue_mapping = q->band2queue[band];
return q->queues[band];
}
@@ -144,17 +155,59 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, q->band2queue[prio])) {
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
}
}
return NULL;
}
+#ifdef CONFIG_NET_SCH_RR
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ int bandcount;
+
+ /* Only take one pass through the queues. If nothing is available,
+ * return nothing.
+ */
+ for (bandcount = 0; bandcount < q->bands; bandcount++) {
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues. If the queue is stopped, try the
+ * next queue.
+ */
+ if (!netif_subqueue_stopped(sch->dev, q->band2queue[q->curband])) {
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ return skb;
+ }
+ }
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ }
+ return NULL;
+}
+#endif
+
static unsigned int prio_drop(struct Qdisc* sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
@@ -200,6 +253,7 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt *qopt = RTA_DATA(opt);
int i;
+ int queue;
if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
return -EINVAL;
@@ -211,6 +265,22 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
return -EINVAL;
}
+ /* If we're prio multiqueue or are using round-robin, make
+ * sure the number of incoming bands matches the number of
+ * queues on the device we're associating with.
+ */
+#ifdef CONFIG_NET_SCH_RR
+ if (strcmp("rr", sch->ops->id) == 0)
+ if (qopt->bands != sch->dev->egress_subqueue_count)
+ return -EINVAL;
+#endif
+
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ if (strcmp("prio", sch->ops->id) == 0)
+ if (qopt->bands != sch->dev->egress_subqueue_count)
+ return -EINVAL;
+#endif
+
sch_tree_lock(sch);
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
@@ -242,6 +312,18 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
}
}
}
+
+ /* setup queue to band mapping */
+ for (i = 0, queue = 0; i < q->bands; i++, queue++)
+ q->band2queue[i] = queue;
+
+#ifndef CONFIG_NET_SCH_PRIO_MQ
+ /* for non-mq prio */
+ if (strcmp("prio", sch->ops->id) == 0)
+ for (i = 0; i < q->bands; i++)
+ q->band2queue[i] = 0;
+#endif
+
return 0;
}
@@ -443,17 +525,44 @@ static struct Qdisc_ops prio_qdisc_ops = {
.owner = THIS_MODULE,
};
+#ifdef CONFIG_NET_SCH_RR
+static struct Qdisc_ops rr_qdisc_ops = {
+ .next = NULL,
+ .cl_ops = &prio_class_ops,
+ .id = "rr",
+ .priv_size = sizeof(struct prio_sched_data),
+ .enqueue = prio_enqueue,
+ .dequeue = rr_dequeue,
+ .requeue = prio_requeue,
+ .drop = prio_drop,
+ .init = prio_init,
+ .reset = prio_reset,
+ .destroy = prio_destroy,
+ .change = prio_tune,
+ .dump = prio_dump,
+ .owner = THIS_MODULE,
+};
+#endif
+
static int __init prio_module_init(void)
{
- return register_qdisc(&prio_qdisc_ops);
+ register_qdisc(&prio_qdisc_ops);
+#ifdef CONFIG_NET_SCH_RR
+ register_qdisc(&rr_qdisc_ops);
+#endif
+ return 0;
}
static void __exit prio_module_exit(void)
{
unregister_qdisc(&prio_qdisc_ops);
+#ifdef CONFIG_NET_SCH_RR
+ unregister_qdisc(&rr_qdisc_ops);
+#endif
}
module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");
+MODULE_ALIAS("sch_rr");
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-21 21:26 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
@ 2007-06-21 23:47 ` Patrick McHardy
2007-06-22 0:01 ` Waskiewicz Jr, Peter P
2007-06-22 18:00 ` Waskiewicz Jr, Peter P
0 siblings, 2 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-21 23:47 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok, hadi
PJ Waskiewicz wrote:
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 475df84..ca0b352 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -102,8 +102,16 @@ config NET_SCH_ATM
> To compile this code as a module, choose M here: the
> module will be called sch_atm.
>
> +config NET_SCH_BANDS
> + bool "Multi Band Queueing (PRIO and RR)"
> + ---help---
> + Say Y here if you want to use n-band multiqueue packet
> + schedulers. These include a priority-based scheduler and
> + a round-robin scheduler.
> +
> config NET_SCH_PRIO
> tristate "Multi Band Priority Queueing (PRIO)"
> + depends on NET_SCH_BANDS
> ---help---
> Say Y here if you want to use an n-band priority queue packet
> scheduler.
> @@ -111,6 +119,30 @@ config NET_SCH_PRIO
> To compile this code as a module, choose M here: the
> module will be called sch_prio.
>
> +config NET_SCH_PRIO_MQ
> + bool "Multiple hardware queue support for PRIO"
> + depends on NET_SCH_PRIO
> + ---help---
> + Say Y here if you want to allow the PRIO qdisc to assign
> + flows to multiple hardware queues on an ethernet device. This
> + will still work on devices with 1 queue.
> +
> + Consider this scheduler for devices that do not use
> + hardware-based scheduling policies. Otherwise, use NET_SCH_RR.
> +
> + Most people will say N here.
> +
> +config NET_SCH_RR
> + bool "Multi Band Round Robin Queuing (RR)"
> + depends on NET_SCH_BANDS && NET_SCH_PRIO
> + ---help---
> + Say Y here if you want to use an n-band round robin packet
> + scheduler.
> +
> + The module uses sch_prio for its framework and is aliased as
> + sch_rr, so it will load sch_prio, although it is referred
> + to using sch_rr.
>
The dependencies seem to be very confused. SCH_PRIO does not depend
on anything new, SCH_RR also doesn't depend on anything. SCH_PRIO_MQ
and SCH_RR_MQ (which is missing) depend on SCH_PRIO/SCH_RR. A single
NET_SCH_MULTIQUEUE option seems better than adding one per scheduler
though.
> --- a/net/sched/sch_prio.c
> +++ b/net/sched/sch_prio.c
> @@ -9,6 +9,8 @@
> * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
> * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
> * Init -- EINVAL when opt undefined
> + * Additions: Peter P. Waskiewicz Jr. <peter.p.waskiewicz.jr@intel.com>
> + * Added round-robin scheduling for selection at load-time
>
git keeps changelogs, please don't add it here.
> */
>
> #include <linux/module.h>
> @@ -40,9 +42,13 @@
> struct prio_sched_data
> {
> int bands;
> +#ifdef CONFIG_NET_SCH_RR
> + int curband; /* for round-robin */
> +#endif
> struct tcf_proto *filter_list;
> u8 prio2band[TC_PRIO_MAX+1];
> struct Qdisc *queues[TCQ_PRIO_BANDS];
> + u16 band2queue[TC_PRIO_MAX + 1];
>
Why is this still here? It's a 1:1 mapping.
> @@ -211,6 +265,22 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
> return -EINVAL;
> }
>
> + /* If we're prio multiqueue or are using round-robin, make
> + * sure the number of incoming bands matches the number of
> + * queues on the device we're associating with.
> + */
> +#ifdef CONFIG_NET_SCH_RR
> + if (strcmp("rr", sch->ops->id) == 0)
> + if (qopt->bands != sch->dev->egress_subqueue_count)
> + return -EINVAL;
> +#endif
> +
> +#ifdef CONFIG_NET_SCH_PRIO_MQ
> + if (strcmp("prio", sch->ops->id) == 0)
> + if (qopt->bands != sch->dev->egress_subqueue_count)
> + return -EINVAL;
> +#endif
>
For the tenth time now, the user should enable this at
runtime. You can't just break things dependent on config
options.
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-21 23:47 ` Patrick McHardy
@ 2007-06-22 0:01 ` Waskiewicz Jr, Peter P
2007-06-22 0:26 ` Patrick McHardy
2007-06-22 18:00 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-22 0:01 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> The dependencies seem to be very confused. SCH_PRIO does
> not depend on anything new, SCH_RR also doesn't depend on
> anything. SCH_PRIO_MQ and SCH_RR_MQ (which is missing) depend
> on SCH_PRIO/SCH_RR. A single NET_SCH_MULTIQUEUE option seems
> better than adding one per scheduler though.
I agree with a NET_SCH_MULTIQUEUE option. However, SCH_RR does depend
on SCH_PRIO being built since it's the same code, doesn't it? Maybe I'm
not understanding something about the build process. I'll clean this
up.
>
> > --- a/net/sched/sch_prio.c
> > +++ b/net/sched/sch_prio.c
> > @@ -9,6 +9,8 @@
> > * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
> > * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
> > * Init -- EINVAL when opt undefined
> > + * Additions: Peter P. Waskiewicz Jr.
> <peter.p.waskiewicz.jr@intel.com>
> > + * Added round-robin scheduling for selection at load-time
> >
>
> git keeps changelogs, please don't add it here.
Roger.
> > struct tcf_proto *filter_list;
> > u8 prio2band[TC_PRIO_MAX+1];
> > struct Qdisc *queues[TCQ_PRIO_BANDS];
> > + u16 band2queue[TC_PRIO_MAX + 1];
> >
>
> Why is this still here? It's a 1:1 mapping.
I'll fix this.
> > @@ -211,6 +265,22 @@ static int prio_tune(struct Qdisc
> *sch, struct rtattr *opt)
> > return -EINVAL;
> > }
> >
> > + /* If we're prio multiqueue or are using round-robin, make
> > + * sure the number of incoming bands matches the number of
> > + * queues on the device we're associating with.
> > + */
> > +#ifdef CONFIG_NET_SCH_RR
> > + if (strcmp("rr", sch->ops->id) == 0)
> > + if (qopt->bands != sch->dev->egress_subqueue_count)
> > + return -EINVAL;
> > +#endif
> > +
> > +#ifdef CONFIG_NET_SCH_PRIO_MQ
> > + if (strcmp("prio", sch->ops->id) == 0)
> > + if (qopt->bands != sch->dev->egress_subqueue_count)
> > + return -EINVAL;
> > +#endif
> >
>
> For the tenth time now, the user should enable this at
> runtime. You can't just break things dependent on config options.
I had this in sch_prio and tc before, and was told to remove it because
of ABI issues. I can put it back in, but I'm not sure what those
previous ABI issues were. Was it backwards compatibility that you
referred to before that was broken?
As always, the feedback is very much appreciated. I'll get these fixes
in as soon as possible.
-PJ
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-22 0:01 ` Waskiewicz Jr, Peter P
@ 2007-06-22 0:26 ` Patrick McHardy
0 siblings, 0 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-22 0:26 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
>> The dependencies seem to be very confused. SCH_PRIO does
>> not depend on anything new, SCH_RR also doesn't depend on
>> anything. SCH_PRIO_MQ and SCH_RR_MQ (which is missing) depend
>> on SCH_PRIO/SCH_RR. A single NET_SCH_MULTIQUEUE option seems
>> better than adding one per scheduler though.
>>
>
> I agree with a NET_SCH_MULTIQUEUE option. However, SCH_RR does depend
> on SCH_PRIO being built since it's the same code, doesn't it? Maybe I'm
> not understanding something about the build process. I'll clean this
> up.
The easiest solution is to select SCH_PRIO from SCH_RR.
I had something else in mind initially, but that is
needlessly complicated.
>>
>> For the tenth time now, the user should enable this at
>> runtime. You can't just break things dependent on config options.
>>
>
> I had this in sch_prio and tc before, and was told to remove it because
> of ABI issues. I can put it back in, but I'm not sure what those
> previous ABI issues were. Was it backwards compatibility that you
> referred to before that was broken?
Your tc changes changed the structure in a way that old tc binaries
wouldn't work anymore. This version breaks configurations that use
a number of bands not matching the HW queues when the user enables
the multiqueue compile time option.
Unfortunately prio does not use nested attributes, so the easiest
way is extending struct tc_prio_qopt at the end and checking the
attribute size to decide whether it's an old or a new version.
A better fix would be to introduce a new qdisc configuration
attribute that takes precedence before TCA_OPTIONS and have
userspace send both the old non-nested structure and a new
nested configuration. That would make sure we never run into
this problem again.
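As a rough sketch of the size-check approach (the struct and helper names
below are hypothetical, not from any posted patch):

struct tc_prio_qopt_ext {		/* hypothetical extended layout */
	struct tc_prio_qopt base;	/* existing fields must stay first */
	__u8 multiqueue;		/* new field appended at the end */
};

/* in prio_tune(): accept both old and new userspace */
static void prio_parse_mq(struct rtattr *opt, struct prio_sched_data *q)
{
	struct tc_prio_qopt_ext *ext = RTA_DATA(opt);

	if (RTA_PAYLOAD(opt) >= sizeof(*ext))
		q->mq = ext->multiqueue;	/* new tc binary */
	else
		q->mq = 0;			/* old tc binary: old behaviour */
}

Old tc binaries keep sending the short structure, so they continue to work
unchanged.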
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-21 23:47 ` Patrick McHardy
2007-06-22 0:01 ` Waskiewicz Jr, Peter P
@ 2007-06-22 18:00 ` Waskiewicz Jr, Peter P
2007-06-22 18:42 ` Patrick McHardy
1 sibling, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-22 18:00 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> > #include <linux/module.h>
> > @@ -40,9 +42,13 @@
> > struct prio_sched_data
> > {
> > int bands;
> > +#ifdef CONFIG_NET_SCH_RR
> > + int curband; /* for round-robin */
> > +#endif
> > struct tcf_proto *filter_list;
> > u8 prio2band[TC_PRIO_MAX+1];
> > struct Qdisc *queues[TCQ_PRIO_BANDS];
> > + u16 band2queue[TC_PRIO_MAX + 1];
> >
>
> Why is this still here? It's a 1:1 mapping.
Thought about this more last night and this morning. As far as I can
tell, I still need this. If the qdisc gets loaded with multiqueue
turned on, I can just use the value of band to assign
skb->queue_mapping. But if the qdisc is loaded without multiqueue
support, then I need to assign a value of zero to queue_mapping, or not
assign it at all (it will be zero'd out before the call to ->enqueue()
in dev_queue_xmit()). But I'd rather not have a conditional in the
hotpath checking if the qdisc is multiqueue; I'd rather have the array
to match the bands so I can just do an assignment.
What do you think?
Thanks,
-PJ
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-22 18:00 ` Waskiewicz Jr, Peter P
@ 2007-06-22 18:42 ` Patrick McHardy
2007-06-22 18:44 ` Patrick McHardy
2007-06-22 18:53 ` Patrick McHardy
0 siblings, 2 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-22 18:42 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
>>> #include <linux/module.h>
>>>@@ -40,9 +42,13 @@
>>> struct prio_sched_data
>>> {
>>> int bands;
>>>+#ifdef CONFIG_NET_SCH_RR
>>>+ int curband; /* for round-robin */
>>>+#endif
>>> struct tcf_proto *filter_list;
>>> u8 prio2band[TC_PRIO_MAX+1];
>>> struct Qdisc *queues[TCQ_PRIO_BANDS];
>>>+ u16 band2queue[TC_PRIO_MAX + 1];
>>>
>>
>>Why is this still here? It's a 1:1 mapping.
>
>
> Thought about this more last night and this morning. As far as I can
> tell, I still need this. If the qdisc gets loaded with multiqueue
> turned on, I can just use the value of band to assign
> skb->queue_mapping. But if the qdisc is loaded without multiqueue
> support, then I need to assign a value of zero to queue_mapping, or not
> assign it at all (it will be zero'd out before the call to ->enqueue()
> in dev_queue_xmit()). But I'd rather not have a conditional in the
> hotpath checking if the qdisc is multiqueue; I'd rather have the array
> to match the bands so I can just do an assignment.
>
> What do you think?
I very much doubt that it has any measurable impact. You can
also add a small inline function
void skb_set_queue_mapping(struct sk_buff *skb, unsigned int queue)
{
#ifdef CONFIG_NET_SCH_MULTIQUEUE
skb->queue_mapping = queue;
#else
skb->queue_mapping = 0;
#endif
}
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-22 18:42 ` Patrick McHardy
@ 2007-06-22 18:44 ` Patrick McHardy
2007-06-22 18:53 ` Patrick McHardy
1 sibling, 0 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-22 18:44 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H,
hadi
Patrick McHardy wrote:
> void skb_set_queue_mapping(struct sk_buff *skb, unsigned int queue)
> {
> #ifdef CONFIG_NET_SCH_MULTIQUEUE
> skb->queue_mapping = queue;
> #else
> skb->queue_mapping = 0;
> #endif
Maybe even use it everywhere and guard skb->queue_mapping by
an #ifdef; on 32 bit it does enlarge the skb.
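Put together, that might look like this (a sketch; CONFIG_NET_SCH_MULTIQUEUE
is the assumed option name, and the skb field itself would sit under the
same #ifdef):

static inline void skb_set_queue_mapping(struct sk_buff *skb,
					 unsigned int queue)
{
#ifdef CONFIG_NET_SCH_MULTIQUEUE
	skb->queue_mapping = queue;
#endif
}

/* hypothetical read-side helper to match */
static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_SCH_MULTIQUEUE
	return skb->queue_mapping;
#else
	return 0;
#endif
}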
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-22 18:42 ` Patrick McHardy
2007-06-22 18:44 ` Patrick McHardy
@ 2007-06-22 18:53 ` Patrick McHardy
2007-06-22 21:03 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-22 18:53 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Patrick McHardy wrote:
> Waskiewicz Jr, Peter P wrote:
>
>>Thought about this more last night and this morning. As far as I can
>>tell, I still need this. If the qdisc gets loaded with multiqueue
>>turned on, I can just use the value of band to assign
>>skb->queue_mapping. But if the qdisc is loaded without multiqueue
>>support, then I need to assign a value of zero to queue_mapping, or not
>>assign it at all (it will be zero'd out before the call to ->enqueue()
>>in dev_queue_xmit()). But I'd rather not have a conditional in the
>>hotpath checking if the qdisc is multiqueue; I'd rather have the array
>>to match the bands so I can just do an assignment.
>>
>>What do you think?
>
>
>
> I very much doubt that it has any measurable impact. You can
> also add a small inline function
>
> void skb_set_queue_mapping(struct sk_buff *skb, unsigned int queue)
OK I didn't really listen obviously :) A compile time option
won't help. Just remove it and assign it conditionally.
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-22 18:53 ` Patrick McHardy
@ 2007-06-22 21:03 ` Waskiewicz Jr, Peter P
0 siblings, 0 replies; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-22 21:03 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> Patrick McHardy wrote:
> > Waskiewicz Jr, Peter P wrote:
> >
> >>Thought about this more last night and this morning. As
> far as I can
> >>tell, I still need this. If the qdisc gets loaded with multiqueue
> >>turned on, I can just use the value of band to assign
> >>skb->queue_mapping. But if the qdisc is loaded without multiqueue
> >>support, then I need to assign a value of zero to queue_mapping, or
> >>not assign it at all (it will be zero'd out before the call to
> >>->enqueue() in dev_queue_xmit()). But I'd rather not have a
> >>conditional in the hotpath checking if the qdisc is multiqueue; I'd
> >>rather have the array to match the bands so I can just do
> an assignment.
> >>
> >>What do you think?
> >
> >
> >
> > I very much doubt that it has any measurable impact. You
> can also add
> > a small inline function
> >
> > void skb_set_queue_mapping(struct sk_buff *skb, unsigned int queue)
>
>
> OK I didn't really listen obviously :) A compile time option
> won't help. Just remove it and assign it conditionally.
Sounds good. Thanks Patrick.
-PJ
* [PATCH] NET: Multiple queue hardware support
@ 2007-06-23 21:36 PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 1/3] NET: [DOC] Multiqueue hardware support documentation PJ Waskiewicz
` (2 more replies)
0 siblings, 3 replies; 40+ messages in thread
From: PJ Waskiewicz @ 2007-06-23 21:36 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok, hadi, kaber
Please consider these patches for 2.6.23 inclusion.
These patches are built against Patrick McHardy's recently submitted
RTNETLINK nested compat attribute patches. They're needed to preserve
ABI between sch_{rr|prio} and iproute2.
Updates since the last submission:
1. Added checks for netif_subqueue_stopped() to net/core/netpoll.c,
net/core/pktgen.c, and to software device hard_start_xmit in
dev_queue_xmit().
2. Removed TCA_PRIO_TEST and added TCA_PRIO_MQ for sch_prio and sch_rr.
3. Fixed dependency issues in net/sched/Kconfig with NET_SCH_RR.
4. Implemented the new nested compat attribute API for MQ in NET_SCH_PRIO
and NET_SCH_RR.
5. Allow sch_rr and sch_prio to turn multiqueue hardware support on and off
at load time.
This patchset is an updated version of previous multiqueue network device
support patches. The general approach of introducing a new API for multiqueue
network devices to register with the stack has remained. The changes include
adding a round-robin qdisc, heavily based on sch_prio, which will allow
queueing to hardware with no OS-enforced queuing policy. sch_prio still has
the multiqueue code in it, but has a Kconfig option to compile it out of the
qdisc. This allows people with hardware containing scheduling policies to
use sch_rr (round-robin), and others without scheduling policies in hardware
to continue using sch_prio if they wish to have some notion of scheduling
priority.
The patches being sent are split into Documentation, Qdisc changes, and
core stack changes. The requested e1000 changes are still being resolved,
and will be sent at a later date.
The patches to iproute2 for tc will be sent separately, to support sch_rr.
--
PJ Waskiewicz <peter.p.waskiewicz.jr@intel.com>
* [PATCH 1/3] NET: [DOC] Multiqueue hardware support documentation
2007-06-23 21:36 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
@ 2007-06-23 21:36 ` PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 2/3] NET: [CORE] Stack changes to add multiqueue hardware support API PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
2 siblings, 0 replies; 40+ messages in thread
From: PJ Waskiewicz @ 2007-06-23 21:36 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok, hadi, kaber
Add a brief howto to Documentation/networking for multiqueue. It
explains how to use the multiqueue API in a driver to support
multiqueue paths from the stack, as well as the qdiscs to use for
feeding a multiqueue device.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---
Documentation/networking/multiqueue.txt | 106 +++++++++++++++++++++++++++++++
1 files changed, 106 insertions(+), 0 deletions(-)
diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt
new file mode 100644
index 0000000..b7ede56
--- /dev/null
+++ b/Documentation/networking/multiqueue.txt
@@ -0,0 +1,106 @@
+
+ HOWTO for multiqueue network device support
+ ===========================================
+
+Section 1: Base driver requirements for implementing multiqueue support
+Section 2: Qdisc support for multiqueue devices
+Section 3: Brief howto using PRIO or RR for multiqueue devices
+
+
+Intro: Kernel support for multiqueue devices
+---------------------------------------------------------
+
+Kernel support for multiqueue devices is only an API that is presented to the
+netdevice layer for base drivers to implement. This feature is part of the
+core networking stack, and all network devices will be running on the
+multiqueue-aware stack. If a base driver only has one queue, then these
+changes are transparent to that driver.
+
+
+Section 1: Base driver requirements for implementing multiqueue support
+-----------------------------------------------------------------------
+
+Base drivers are required to use the new alloc_etherdev_mq() or
+alloc_netdev_mq() functions to allocate the subqueues for the device. The
+underlying kernel API will take care of the allocation and deallocation of
+the subqueue memory, as well as netdev configuration of where the queues
+exist in memory.
+
+The base driver will also need to manage the queues as it does the global
+netdev->queue_lock today. Therefore base drivers should use the
+netif_{start|stop|wake}_subqueue() functions to manage each queue while the
+device is still operational. netdev->queue_lock is still used when the device
+comes online or when it's completely shut down (unregister_netdev(), etc.).
+
+Finally, the base driver should indicate that it is a multiqueue device. The
+feature flag NETIF_F_MULTI_QUEUE should be added to the netdev->features
+bitmap on device initialization. Below is an example from e1000:
+
+#ifdef CONFIG_E1000_MQ
+ if ( (adapter->hw.mac.type == e1000_82571) ||
+ (adapter->hw.mac.type == e1000_82572) ||
+ (adapter->hw.mac.type == e1000_80003es2lan))
+ netdev->features |= NETIF_F_MULTI_QUEUE;
+#endif
+
+
+Section 2: Qdisc support for multiqueue devices
+-----------------------------------------------
+
+Currently two qdiscs support multiqueue devices: a new round-robin qdisc,
+sch_rr, and sch_prio. The qdisc is responsible for classifying the skb's to
+bands and queues, and will store the queue mapping into skb->queue_mapping.
+Use this field in the base driver to determine which queue to send the skb
+to.
+
+sch_rr has been added for hardware that doesn't want scheduling policies from
+software, so it's a straight round-robin qdisc. It uses the same syntax and
+classification priomap that sch_prio uses, so it should be intuitive to
+configure for people who've used sch_prio.
+
+The PRIO qdisc naturally plugs into a multiqueue device. If PRIO has been
+built with NET_SCH_PRIO_MQ, then upon load, it will make sure the number of
+bands requested is equal to the number of queues on the hardware. If they
+are equal, it sets a one-to-one mapping up between the queues and bands. If
+they're not equal, it will not load the qdisc. This is the same behavior
+for RR. Once the association is made, any skb that is classified will have
+skb->queue_mapping set, which will allow the driver to properly queue skb's
+to multiple queues.
+
+
+Section 3: Brief howto using PRIO and RR for multiqueue devices
+---------------------------------------------------------------
+
+The userspace command 'tc,' part of the iproute2 package, is used to configure
+qdiscs. To add the PRIO qdisc to your network device, assuming the device is
+called eth0, run the following command:
+
+# tc qdisc add dev eth0 root handle 1: prio bands 4 multiqueue
+
+This will create 4 bands, 0 being highest priority, and associate those bands
+to the queues on your NIC. Assuming eth0 has 4 Tx queues, the band mapping
+would look like:
+
+band 0 => queue 0
+band 1 => queue 1
+band 2 => queue 2
+band 3 => queue 3
+
+Traffic will begin flowing through each queue if your TOS values are assigning
+traffic across the various bands. For example, ssh traffic will always try to
+go out band 0 based on TOS -> Linux priority conversion (realtime traffic),
+so it will be sent out queue 0. ICMP traffic (pings) falls into the "normal"
+traffic classification, which is band 1. Therefore pings will be sent out
+queue 1 on the NIC.
+
+Note the use of the multiqueue keyword. This is only in versions of iproute2
+that support multiqueue networking devices; if this is omitted when loading
+a qdisc onto a multiqueue device, the qdisc will load and operate the same
+if it were loaded onto a single-queue device (i.e. - sends all traffic to
+queue 0).
+
+The behavior of tc filters remains the same, where it will override TOS priority
+classification.
+
+
+Author: Peter P. Waskiewicz Jr. <peter.p.waskiewicz.jr@intel.com>
* [PATCH 2/3] NET: [CORE] Stack changes to add multiqueue hardware support API
2007-06-23 21:36 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 1/3] NET: [DOC] Multiqueue hardware support documentation PJ Waskiewicz
@ 2007-06-23 21:36 ` PJ Waskiewicz
2007-06-24 12:00 ` Patrick McHardy
2007-06-23 21:36 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
2 siblings, 1 reply; 40+ messages in thread
From: PJ Waskiewicz @ 2007-06-23 21:36 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok, hadi, kaber
Updated: Added checks for netif_subqueue_stopped() to netpoll,
pktgen, and software device dev_queue_xmit(). This will ensure
external events to these subsystems will be handled correctly if
a subqueue is shut down.
Add the multiqueue hardware device support API to the core network
stack. Allow drivers to allocate multiple queues and manage them
at the netdev level if they choose to do so.
Added a new field to sk_buff, namely queue_mapping, for drivers to
know which tx_ring to select based on OS classification of the flow.
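A minimal sketch of how a base driver might use this API (the adapter
structure, ring lookup and queue count below are illustrative only):

/* allocation: a netdev with four TX queues */
netdev = alloc_etherdev_mq(sizeof(struct my_adapter), 4);
netdev->features |= NETIF_F_MULTI_QUEUE;

/* hard_start_xmit: use the ring the stack selected */
tx_ring = &adapter->tx_ring[skb->queue_mapping];
if (my_ring_full(tx_ring))	/* hypothetical helper */
	netif_stop_subqueue(netdev, skb->queue_mapping);

/* TX clean-up: reopen the subqueue once descriptors free up */
netif_wake_subqueue(netdev, skb->queue_mapping);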
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---
include/linux/etherdevice.h | 3 +-
include/linux/netdevice.h | 62 ++++++++++++++++++++++++++++++++++++++++++-
include/linux/skbuff.h | 4 ++-
net/core/dev.c | 27 +++++++++++++------
net/core/netpoll.c | 8 +++---
net/core/pktgen.c | 10 +++++--
net/core/skbuff.c | 3 ++
net/ethernet/eth.c | 9 +++---
8 files changed, 104 insertions(+), 22 deletions(-)
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index f48eb89..b3fbb54 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -39,7 +39,8 @@ extern void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev
extern int eth_header_cache(struct neighbour *neigh,
struct hh_cache *hh);
-extern struct net_device *alloc_etherdev(int sizeof_priv);
+extern struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count);
+#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
/**
* is_zero_ether_addr - Determine if give Ethernet address is all zeros.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e7913ee..6509eb4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -108,6 +108,14 @@ struct wireless_dev;
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif
+struct net_device_subqueue
+{
+ /* Give a control state for each queue. This struct may contain
+ * per-queue locks in the future.
+ */
+ unsigned long state;
+};
+
/*
* Network device statistics. Akin to the 2.0 ether stats but
* with byte counters.
@@ -325,6 +333,7 @@ struct net_device
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
+#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
@@ -543,6 +552,10 @@ struct net_device
/* rtnetlink link ops */
const struct rtnl_link_ops *rtnl_link_ops;
+
+ /* The TX queue control structures */
+ int egress_subqueue_count;
+ struct net_device_subqueue egress_subqueue[0];
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
@@ -705,6 +718,48 @@ static inline int netif_running(const struct net_device *dev)
return test_bit(__LINK_STATE_START, &dev->state);
}
+/*
+ * Routines to manage the subqueues on a device. We only need start,
+ * stop, and a check if it's stopped. All other device management is
+ * done at the overall netdevice level.
+ * Also test the device if we're multiqueue.
+ */
+static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
+{
+ clear_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ set_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+static inline int netif_subqueue_stopped(const struct net_device *dev,
+ u16 queue_index)
+{
+ return test_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state);
+}
+
+static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ if (test_and_clear_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state))
+ __netif_schedule(dev);
+}
+
+static inline int netif_is_multiqueue(const struct net_device *dev)
+{
+ return (!!(NETIF_F_MULTI_QUEUE & dev->features));
+}
/* Use this variant when it is known for sure that it
* is executing from interrupt context.
@@ -995,8 +1050,11 @@ static inline void netif_tx_disable(struct net_device *dev)
extern void ether_setup(struct net_device *dev);
/* Support for loadable net-drivers */
-extern struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *));
+extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *),
+ int queue_count);
+#define alloc_netdev(sizeof_priv, name, setup) \
+ alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
/* Functions used for multicast support */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e7367c7..01b5e25 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -197,6 +197,7 @@ typedef unsigned char *sk_buff_data_t;
* @tstamp: Time we arrived
* @dev: Device we arrived on/are leaving by
* @iif: ifindex of device we arrived on
+ * @queue_mapping: Queue mapping for multiqueue devices
* @transport_header: Transport layer header
* @network_header: Network layer header
* @mac_header: Link layer header
@@ -246,7 +247,8 @@ struct sk_buff {
ktime_t tstamp;
struct net_device *dev;
int iif;
- /* 4 byte hole on 64 bit*/
+ __u16 queue_mapping;
+ /* 2 byte hole on 64 bit*/
struct dst_entry *dst;
struct sec_path *sp;
diff --git a/net/core/dev.c b/net/core/dev.c
index 2609062..9ea8a47 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1429,7 +1429,9 @@ gso:
skb->next = nskb;
return rc;
}
- if (unlikely(netif_queue_stopped(dev) && skb->next))
+ if (unlikely((netif_queue_stopped(dev) ||
+ netif_subqueue_stopped(dev, skb->queue_mapping)) &&
+ skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
@@ -1545,6 +1547,8 @@ gso:
spin_lock(&dev->queue_lock);
q = dev->qdisc;
if (q->enqueue) {
+ /* reset queue_mapping to zero */
+ skb->queue_mapping = 0;
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
@@ -1574,7 +1578,8 @@ gso:
HARD_TX_LOCK(dev, cpu);
- if (!netif_queue_stopped(dev)) {
+ if (!netif_queue_stopped(dev) &&
+ !netif_subqueue_stopped(dev, skb->queue_mapping)) {
rc = 0;
if (!dev_hard_start_xmit(skb, dev)) {
HARD_TX_UNLOCK(dev);
@@ -3343,16 +3348,18 @@ static struct net_device_stats *internal_stats(struct net_device *dev)
}
/**
- * alloc_netdev - allocate network device
+ * alloc_netdev_mq - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
+ * @queue_count: the number of subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
- * and performs basic initialization.
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device at the end of the netdevice.
*/
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *))
+struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *), int queue_count)
{
void *p;
struct net_device *dev;
@@ -3361,7 +3368,9 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
BUG_ON(strlen(name) >= sizeof(dev->name));
/* ensure 32-byte alignment of both the device and private area */
- alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
+ alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
+ (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
+ ~NETDEV_ALIGN_CONST;
alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
p = kzalloc(alloc_size, GFP_KERNEL);
@@ -3377,12 +3386,14 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
if (sizeof_priv)
dev->priv = netdev_priv(dev);
+ dev->egress_subqueue_count = queue_count;
+
dev->get_stats = internal_stats;
setup(dev);
strcpy(dev->name, name);
return dev;
}
-EXPORT_SYMBOL(alloc_netdev);
+EXPORT_SYMBOL(alloc_netdev_mq);
/**
* free_netdev - free network device
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 758dafe..aac8acf 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -66,8 +66,9 @@ static void queue_process(struct work_struct *work)
local_irq_save(flags);
netif_tx_lock(dev);
- if (netif_queue_stopped(dev) ||
- dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ if ((netif_queue_stopped(dev) ||
+ netif_subqueue_stopped(dev, skb->queue_mapping)) ||
+ dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
netif_tx_unlock(dev);
local_irq_restore(flags);
@@ -254,7 +255,8 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
- if (!netif_queue_stopped(dev))
+ if (!netif_queue_stopped(dev) &&
+ !netif_subqueue_stopped(dev, skb->queue_mapping))
status = dev->hard_start_xmit(skb, dev);
if (status == NETDEV_TX_OK)
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 9cd3a1c..dffe067 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3139,7 +3139,9 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
}
}
- if (netif_queue_stopped(odev) || need_resched()) {
+ if ((netif_queue_stopped(odev) ||
+ netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) ||
+ need_resched()) {
idle_start = getCurUs();
if (!netif_running(odev)) {
@@ -3154,7 +3156,8 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->idle_acc += getCurUs() - idle_start;
- if (netif_queue_stopped(odev)) {
+ if (netif_queue_stopped(odev) ||
+ netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) {
pkt_dev->next_tx_us = getCurUs(); /* TODO */
pkt_dev->next_tx_ns = 0;
goto out; /* Try the next interface */
@@ -3181,7 +3184,8 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
}
netif_tx_lock_bh(odev);
- if (!netif_queue_stopped(odev)) {
+ if (!netif_queue_stopped(odev) &&
+ !netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) {
atomic_inc(&(pkt_dev->skb->users));
retry_now:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7c6a34e..7bbed45 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -418,6 +418,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->nohdr = 0;
C(pkt_type);
C(ip_summed);
+ C(queue_mapping);
C(priority);
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
C(ipvs_property);
@@ -459,6 +460,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
new->sk = NULL;
new->dev = old->dev;
+ new->queue_mapping = old->queue_mapping;
new->priority = old->priority;
new->protocol = old->protocol;
new->dst = dst_clone(old->dst);
@@ -1925,6 +1927,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
tail = nskb;
nskb->dev = skb->dev;
+ nskb->queue_mapping = skb->queue_mapping;
nskb->priority = skb->priority;
nskb->protocol = skb->protocol;
nskb->dst = dst_clone(skb->dst);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 0ac2524..87a509c 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -316,9 +316,10 @@ void ether_setup(struct net_device *dev)
EXPORT_SYMBOL(ether_setup);
/**
- * alloc_etherdev - Allocates and sets up an Ethernet device
+ * alloc_etherdev_mq - Allocates and sets up an Ethernet device
* @sizeof_priv: Size of additional driver-private structure to be allocated
* for this Ethernet device
+ * @queue_count: The number of queues this device has.
*
* Fill in the fields of the device structure with Ethernet-generic
* values. Basically does everything except registering the device.
@@ -328,8 +329,8 @@ EXPORT_SYMBOL(ether_setup);
* this private data area.
*/
-struct net_device *alloc_etherdev(int sizeof_priv)
+struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count)
{
- return alloc_netdev(sizeof_priv, "eth%d", ether_setup);
+ return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count);
}
-EXPORT_SYMBOL(alloc_etherdev);
+EXPORT_SYMBOL(alloc_etherdev_mq);
* [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-23 21:36 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 1/3] NET: [DOC] Multiqueue hardware support documentation PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 2/3] NET: [CORE] Stack changes to add multiqueue hardware support API PJ Waskiewicz
@ 2007-06-23 21:36 ` PJ Waskiewicz
2007-06-24 12:16 ` Patrick McHardy
2007-06-24 22:22 ` Patrick McHardy
2 siblings, 2 replies; 40+ messages in thread
From: PJ Waskiewicz @ 2007-06-23 21:36 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok, hadi, kaber
Updated: This patch applies on top of Patrick McHardy's RTNETLINK
nested compat attribute patches. These are required to preserve
ABI for iproute2 when working with the multiqueue qdiscs.
Add the new sch_rr qdisc for multiqueue network device support.
Allow sch_prio and sch_rr to be compiled with or without multiqueue hardware
support.
sch_rr is part of sch_prio, and is referenced from MODULE_ALIAS. This
was done since sch_prio and sch_rr only differ in their dequeue routine.
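Multiqueue behaviour is now requested per qdisc at load time through the
new TCA_PRIO_MQ attribute; with a patched tc this would look roughly like
the following (eth0 and the 4-queue count are only examples):

# tc qdisc add dev eth0 root handle 1: rr bands 4 multiqueue

Omitting the multiqueue keyword leaves TCA_PRIO_MQ unset, and the qdisc
behaves as it would on a single-queue device.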
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---
include/linux/pkt_sched.h | 4 +-
net/sched/Kconfig | 30 +++++++++++++
net/sched/sch_generic.c | 3 +
net/sched/sch_prio.c | 106 ++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 129 insertions(+), 14 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 09808b7..ec3a9a5 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -103,8 +103,8 @@ struct tc_prio_qopt
enum
{
- TCA_PRIO_UNPSEC,
- TCA_PRIO_TEST,
+ TCA_PRIO_UNSPEC,
+ TCA_PRIO_MQ,
__TCA_PRIO_MAX
};
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df84..7f14fa6 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -102,8 +102,16 @@ config NET_SCH_ATM
To compile this code as a module, choose M here: the
module will be called sch_atm.
+config NET_SCH_BANDS
+ bool "Multi Band Queueing (PRIO and RR)"
+ ---help---
+ Say Y here if you want to use n-band multiqueue packet
+ schedulers. These include a priority-based scheduler and
+ a round-robin scheduler.
+
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
+ depends on NET_SCH_BANDS
---help---
Say Y here if you want to use an n-band priority queue packet
scheduler.
@@ -111,6 +119,28 @@ config NET_SCH_PRIO
To compile this code as a module, choose M here: the
module will be called sch_prio.
+config NET_SCH_RR
+ tristate "Multi Band Round Robin Queuing (RR)"
+ depends on NET_SCH_BANDS
+ select NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to use an n-band round robin packet
+ scheduler.
+
+ The module uses sch_prio for its framework and is aliased as
+ sch_rr, so it will load sch_prio, although it is referred
+ to using sch_rr.
+
+config NET_SCH_BANDS_MQ
+ bool "Multiple hardware queue support"
+ depends on NET_SCH_BANDS
+ ---help---
+ Say Y here if you want to allow the PRIO and RR qdiscs to assign
+ flows to multiple hardware queues on an ethernet device. This
+ will still work on devices with 1 queue.
+
+ Most people will say N here.
+
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 9461e8a..203d5c4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -168,7 +168,8 @@ static inline int qdisc_restart(struct net_device *dev)
spin_unlock(&dev->queue_lock);
ret = NETDEV_TX_BUSY;
- if (!netif_queue_stopped(dev))
+ if (!netif_queue_stopped(dev) &&
+ !netif_subqueue_stopped(dev, skb->queue_mapping))
/* churn baby churn .. */
ret = dev_hard_start_xmit(skb, dev);
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 40a13e8..8a716f0 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -40,9 +40,11 @@
struct prio_sched_data
{
int bands;
+ int curband; /* for round-robin */
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+ unsigned char mq;
};
@@ -70,14 +72,28 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
+ if (q->mq)
+ skb->queue_mapping =
+ q->prio2band[band&TC_PRIO_MAX];
+ else
+ skb->queue_mapping = 0;
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
- if (band >= q->bands)
+ if (band >= q->bands) {
+ if (q->mq)
+ skb->queue_mapping = q->prio2band[0];
+ else
+ skb->queue_mapping = 0;
return q->queues[q->prio2band[0]];
+ }
+ if (q->mq)
+ skb->queue_mapping = band;
+ else
+ skb->queue_mapping = 0;
return q->queues[band];
}
@@ -144,17 +160,57 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, (q->mq ? prio : 0))) {
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
}
}
return NULL;
}
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ int bandcount;
+
+ /* Only take one pass through the queues. If nothing is available,
+ * return nothing.
+ */
+ for (bandcount = 0; bandcount < q->bands; bandcount++) {
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues. If the queue is stopped, try the
+ * next queue.
+ */
+ if (!netif_subqueue_stopped(sch->dev, (q->mq ? q->curband : 0))) {
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ return skb;
+ }
+ }
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ }
+ return NULL;
+}
+
static unsigned int prio_drop(struct Qdisc* sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
@@ -202,7 +258,7 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
struct rtattr *tb[TCA_PRIO_MAX];
int i;
- if (rtattr_parse_nested_compat(tb, TCA_PRIO_MAX, opt, (void *)&qopt,
+ if (rtattr_parse_nested_compat(tb, TCA_PRIO_MAX, opt, qopt,
sizeof(*qopt)))
return -EINVAL;
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
@@ -213,8 +269,14 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
return -EINVAL;
}
- if (tb[TCA_PRIO_TEST-1])
- printk("TCA_PRIO_TEST: %u\n", *(u32 *)RTA_DATA(tb[TCA_PRIO_TEST-1]));
+ /* If we're multiqueue, make sure the number of incoming bands
+ * matches the number of queues on the device we're associating with.
+ */
+ if (tb[TCA_PRIO_MQ - 1])
+ q->mq = *(unsigned char *)RTA_DATA(tb[TCA_PRIO_MQ - 1]);
+
+ if (q->mq && (qopt->bands != sch->dev->egress_subqueue_count))
+ return -EINVAL;
sch_tree_lock(sch);
q->bands = qopt->bands;
@@ -280,7 +342,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt);
- RTA_PUT_U32(skb, TCA_PRIO_TEST, 321);
+ RTA_PUT_U8(skb, TCA_PRIO_MQ, q->mq);
RTA_NEST_COMPAT_END(skb, nest);
return skb->len;
@@ -452,17 +514,39 @@ static struct Qdisc_ops prio_qdisc_ops = {
.owner = THIS_MODULE,
};
+static struct Qdisc_ops rr_qdisc_ops = {
+ .next = NULL,
+ .cl_ops = &prio_class_ops,
+ .id = "rr",
+ .priv_size = sizeof(struct prio_sched_data),
+ .enqueue = prio_enqueue,
+ .dequeue = rr_dequeue,
+ .requeue = prio_requeue,
+ .drop = prio_drop,
+ .init = prio_init,
+ .reset = prio_reset,
+ .destroy = prio_destroy,
+ .change = prio_tune,
+ .dump = prio_dump,
+ .owner = THIS_MODULE,
+};
+
static int __init prio_module_init(void)
{
- return register_qdisc(&prio_qdisc_ops);
+ register_qdisc(&prio_qdisc_ops);
+ register_qdisc(&rr_qdisc_ops);
+
+ return 0;
}
static void __exit prio_module_exit(void)
{
unregister_qdisc(&prio_qdisc_ops);
+ unregister_qdisc(&rr_qdisc_ops);
}
module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");
+MODULE_ALIAS("sch_rr");
* Re: [PATCH 2/3] NET: [CORE] Stack changes to add multiqueue hardware support API
2007-06-23 21:36 ` [PATCH 2/3] NET: [CORE] Stack changes to add multiqueue hardware support API PJ Waskiewicz
@ 2007-06-24 12:00 ` Patrick McHardy
2007-06-25 16:25 ` Waskiewicz Jr, Peter P
0 siblings, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-24 12:00 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok, hadi
PJ Waskiewicz wrote:
> +struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> + void (*setup)(struct net_device *), int queue_count)
> {
> void *p;
> struct net_device *dev;
> @@ -3361,7 +3368,9 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
> BUG_ON(strlen(name) >= sizeof(dev->name));
>
> /* ensure 32-byte alignment of both the device and private area */
> - alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
> + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
> + (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
Why queue_count - 1? It should be queue_count, I think.
Otherwise ACK for this patch except that it should also contain the
sch_generic changes.
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-23 21:36 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
@ 2007-06-24 12:16 ` Patrick McHardy
2007-06-25 17:27 ` Waskiewicz Jr, Peter P
2007-06-25 21:53 ` Waskiewicz Jr, Peter P
2007-06-24 22:22 ` Patrick McHardy
1 sibling, 2 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-24 12:16 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok, hadi
PJ Waskiewicz wrote:
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index 09808b7..ec3a9a5 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -103,8 +103,8 @@ struct tc_prio_qopt
>
> enum
> {
> - TCA_PRIO_UNPSEC,
> - TCA_PRIO_TEST,
You misunderstood me. You can work on top of my compat attribute
patches, but the example code should not have to go in to apply
your patch.
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 475df84..7f14fa6 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -102,8 +102,16 @@ config NET_SCH_ATM
> To compile this code as a module, choose M here: the
> module will be called sch_atm.
>
> +config NET_SCH_BANDS
> + bool "Multi Band Queueing (PRIO and RR)"
This option seems useless. It's not used *anywhere* except for
dependencies.
> + ---help---
> + Say Y here if you want to use n-band multiqueue packet
> + schedulers. These include a priority-based scheduler and
> + a round-robin scheduler.
> +
> config NET_SCH_PRIO
> tristate "Multi Band Priority Queueing (PRIO)"
> + depends on NET_SCH_BANDS
And this dependency as well.
> ---help---
> Say Y here if you want to use an n-band priority queue packet
> scheduler.
> @@ -111,6 +119,28 @@ config NET_SCH_PRIO
> To compile this code as a module, choose M here: the
> module will be called sch_prio.
>
> +config NET_SCH_RR
> + tristate "Multi Band Round Robin Queuing (RR)"
> + depends on NET_SCH_BANDS
Same here for RR.
> + select NET_SCH_PRIO
> + ---help---
> + Say Y here if you want to use an n-band round robin packet
> + scheduler.
> +
> + The module uses sch_prio for its framework and is aliased as
> + sch_rr, so it will load sch_prio, although it is referred
> + to using sch_rr.
> +
> +config NET_SCH_BANDS_MQ
> + bool "Multiple hardware queue support"
> + depends on NET_SCH_BANDS
OK, again:
Introduce NET_SCH_RR. NET_SCH_RR selects NET_SCH_PRIO. Nothing at
all changes for NET_SCH_PRIO itself. Additionally introduce a
boolean NET_SCH_MULTIQUEUE. No dependencies at all. Use
NET_SCH_MULTIQUEUE to guard the multiqueue code in sch_prio.c.
Your current code doesn't even have any ifdefs anymore though,
so this might not be needed at all.
Additionally you could later introduce E1000_MULTIQUEUE and
have that select NET_SCH_MULTIQUEUE.
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 9461e8a..203d5c4 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -168,7 +168,8 @@ static inline int qdisc_restart(struct net_device *dev)
> spin_unlock(&dev->queue_lock);
>
> ret = NETDEV_TX_BUSY;
> - if (!netif_queue_stopped(dev))
> + if (!netif_queue_stopped(dev) &&
> + !netif_subqueue_stopped(dev, skb->queue_mapping))
> /* churn baby churn .. */
> ret = dev_hard_start_xmit(skb, dev);
I'll try again - please move this to patch 2/3.
> diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
> index 40a13e8..8a716f0 100644
> --- a/net/sched/sch_prio.c
> +++ b/net/sched/sch_prio.c
> @@ -40,9 +40,11 @@
> struct prio_sched_data
> {
> int bands;
> + int curband; /* for round-robin */
> struct tcf_proto *filter_list;
> u8 prio2band[TC_PRIO_MAX+1];
> struct Qdisc *queues[TCQ_PRIO_BANDS];
> + unsigned char mq;
> };
>
>
> @@ -70,14 +72,28 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
> #endif
> if (TC_H_MAJ(band))
> band = 0;
> + if (q->mq)
> + skb->queue_mapping =
> + q->prio2band[band&TC_PRIO_MAX];
> + else
> + skb->queue_mapping = 0;
Might look cleaner if you have one central point where queue_mapping is
set and the band is returned.
> + /* If we're multiqueue, make sure the number of incoming bands
> + * matches the number of queues on the device we're associating with.
> + */
> + if (tb[TCA_PRIO_MQ - 1])
> + q->mq = *(unsigned char *)RTA_DATA(tb[TCA_PRIO_MQ - 1]);
If you're using it as a flag, please use RTA_GET_FLAG(),
otherwise RTA_GET_U8.
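For reference, the two idioms being contrasted (a sketch against the 2.6.22-era rtnetlink helpers; RTA_GET_U8() jumps to the function's rtattr_failure label on a short payload, while RTA_GET_FLAG() only tests presence):

	/* flag style: attribute present means "multiqueue enabled" */
	q->mq = RTA_GET_FLAG(tb[TCA_PRIO_MQ - 1]);

	/* value style: read an explicit one-byte value instead */
	q->mq = RTA_GET_U8(tb[TCA_PRIO_MQ - 1]);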
> + if (q->mq && (qopt->bands != sch->dev->egress_subqueue_count))
> + return -EINVAL;
>
> sch_tree_lock(sch);
> q->bands = qopt->bands;
> @@ -280,7 +342,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
> memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
>
> nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt);
> - RTA_PUT_U32(skb, TCA_PRIO_TEST, 321);
> + RTA_PUT_U8(skb, TCA_PRIO_MQ, q->mq);
And RTA_PUT_FLAG. Now that I think of it, does it even make
sense to have a prio private flag for this instead of a qdisc global one?
> static int __init prio_module_init(void)
> {
> - return register_qdisc(&prio_qdisc_ops);
> + register_qdisc(&prio_qdisc_ops);
> + register_qdisc(&rr_qdisc_ops);
Proper error handling please.
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-23 21:36 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
2007-06-24 12:16 ` Patrick McHardy
@ 2007-06-24 22:22 ` Patrick McHardy
2007-06-25 17:29 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-24 22:22 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok, hadi
PJ Waskiewicz wrote:
> + /* If we're multiqueue, make sure the number of incoming bands
> + * matches the number of queues on the device we're associating with.
> + */
> + if (tb[TCA_PRIO_MQ - 1])
> + q->mq = *(unsigned char *)RTA_DATA(tb[TCA_PRIO_MQ - 1]);
> +
> + if (q->mq && (qopt->bands != sch->dev->egress_subqueue_count))
> + return -EINVAL;
A nice thing you could do for the user here is use
egress_subqueue_count as default when qopt->bands == 0
(and change tc prio to accept 0 in case it doesn't).
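Concretely, the suggested default would be a couple of lines in prio_tune() (a sketch using the fields from the quoted patch):

	/* bands == 0 in multiqueue mode: default to one band per
	 * hardware queue instead of rejecting the configuration.
	 */
	if (q->mq && qopt->bands == 0)
		q->bands = sch->dev->egress_subqueue_count;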
* RE: [PATCH 2/3] NET: [CORE] Stack changes to add multiqueue hardware support API
2007-06-24 12:00 ` Patrick McHardy
@ 2007-06-25 16:25 ` Waskiewicz Jr, Peter P
0 siblings, 0 replies; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-25 16:25 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> > /* ensure 32-byte alignment of both the device and private area */
> > - alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
> > + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
> > + (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
>
> Why queue_count - 1? It should be queue_count, I think.
I'm not sure what went through my head, but I'll fix this.
> Otherwise ACK for this patch except that it should also
> contain the sch_generic changes.
I misread your previous mail; I'll get the sch_generic.c changes into
this patch.
Thanks Patrick,
-PJ
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-24 12:16 ` Patrick McHardy
@ 2007-06-25 17:27 ` Waskiewicz Jr, Peter P
2007-06-25 17:29 ` Patrick McHardy
2007-06-25 21:53 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-25 17:27 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> > enum
> > {
> > - TCA_PRIO_UNPSEC,
> > - TCA_PRIO_TEST,
>
>
> You misunderstood me. You can work on top of my compat
> attribute patches, but the example code should not have to go
> in to apply your patch.
Ok. I'll fix my patches.
> > diff --git a/net/sched/Kconfig b/net/sched/Kconfig index
> > 475df84..7f14fa6 100644
> > --- a/net/sched/Kconfig
> > +++ b/net/sched/Kconfig
> > @@ -102,8 +102,16 @@ config NET_SCH_ATM
> > To compile this code as a module, choose M here: the
> > module will be called sch_atm.
> >
> > +config NET_SCH_BANDS
> > + bool "Multi Band Queueing (PRIO and RR)"
>
> This option seems useless. It's not used *anywhere* except
> for dependencies.
I was trying to group the multiqueue qdiscs together with this. But I
can see that just having the multiqueue option for scheduling will
cover this. I'll remove it.
> > +config NET_SCH_BANDS_MQ
> > + bool "Multiple hardware queue support"
> > + depends on NET_SCH_BANDS
>
>
> OK, again:
>
> Introduce NET_SCH_RR. NET_SCH_RR selects NET_SCH_PRIO.
> Nothing at all changes for NET_SCH_PRIO itself. Additionally
> introduce a boolean NET_SCH_MULTIQUEUE. No dependencies at
> all. Use NET_SCH_MULTIQUEUE to guard the multiqueue code in
> sch_prio.c.
> Your current code doesn't even have any ifdefs anymore
> though, so this might not be needed at all.
>
> Additionally you could later introduce E1000_MULTIQUEUE and
> have that select NET_SCH_MULTIQUEUE.
I'll clean this up. Thanks for the persistence. :)
> > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> > index 9461e8a..203d5c4 100644
> > --- a/net/sched/sch_generic.c
> > +++ b/net/sched/sch_generic.c
> > @@ -168,7 +168,8 @@ static inline int qdisc_restart(struct net_device *dev)
> > spin_unlock(&dev->queue_lock);
> >
> > ret = NETDEV_TX_BUSY;
> > - if (!netif_queue_stopped(dev))
> > + if (!netif_queue_stopped(dev) &&
> > + !netif_subqueue_stopped(dev, skb->queue_mapping))
> > /* churn baby churn .. */
> > ret = dev_hard_start_xmit(skb, dev);
>
> I'll try again - please move this to patch 2/3.
I'm sorry; I misread your original comment about this. I'll move the
change (although this disappears with Jamal's and KK's qdisc_restart()
cleanup).
> > diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
> > index 40a13e8..8a716f0 100644
> > --- a/net/sched/sch_prio.c
> > +++ b/net/sched/sch_prio.c
> > @@ -40,9 +40,11 @@
> > struct prio_sched_data
> > {
> > int bands;
> > + int curband; /* for round-robin */
> > struct tcf_proto *filter_list;
> > u8 prio2band[TC_PRIO_MAX+1];
> > struct Qdisc *queues[TCQ_PRIO_BANDS];
> > + unsigned char mq;
> > };
> >
> >
> > @@ -70,14 +72,28 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
> > #endif
> > if (TC_H_MAJ(band))
> > band = 0;
> > + if (q->mq)
> > + skb->queue_mapping = q->prio2band[band&TC_PRIO_MAX];
> > + else
> > + skb->queue_mapping = 0;
>
>
> Might look cleaner if you have one central point where
> queue_mapping is set and the band is returned.
I'll see how easy it'll be to condense this; because the queue selected
in the qdisc can differ based on a few different things, I'm not sure it
can all be assigned in one spot. I'll play around with it and see what I
can come up with.
> > + /* If we're multiqueue, make sure the number of incoming bands
> > + * matches the number of queues on the device we're associating with.
> > + */
> > + if (tb[TCA_PRIO_MQ - 1])
> > + q->mq = *(unsigned char *)RTA_DATA(tb[TCA_PRIO_MQ - 1]);
>
>
> If you're using it as a flag, please use RTA_GET_FLAG(),
> otherwise RTA_GET_U8.
Will do. Thanks.
> > + if (q->mq && (qopt->bands != sch->dev->egress_subqueue_count))
> > + return -EINVAL;
> >
> > sch_tree_lock(sch);
> > q->bands = qopt->bands;
> > @@ -280,7 +342,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
> > memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
> >
> > nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt);
> > - RTA_PUT_U32(skb, TCA_PRIO_TEST, 321);
> > + RTA_PUT_U8(skb, TCA_PRIO_MQ, q->mq);
>
>
> And RTA_PUT_FLAG. Now that I think of it, does it even make
> sense to have a prio private flag for this instead of a qdisc
> global one?
There currently aren't any other qdiscs that are natural fits for
multiqueue that I can see. I can see the benefit though of having this
as a global flag in the qdisc API; let me check it out, and if it makes
sense, I can move it.
> > static int __init prio_module_init(void)
> > {
> > - return register_qdisc(&prio_qdisc_ops);
> > + register_qdisc(&prio_qdisc_ops);
> > + register_qdisc(&rr_qdisc_ops);
>
> Proper error handling please.
Will do.
Thanks,
-PJ Waskiewicz
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-24 22:22 ` Patrick McHardy
@ 2007-06-25 17:29 ` Waskiewicz Jr, Peter P
0 siblings, 0 replies; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-25 17:29 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> PJ Waskiewicz wrote:
> > + /* If we're multiqueue, make sure the number of incoming bands
> > + * matches the number of queues on the device we're associating with.
> > + */
> > + if (tb[TCA_PRIO_MQ - 1])
> > + q->mq = *(unsigned char *)RTA_DATA(tb[TCA_PRIO_MQ - 1]);
> > +
> > + if (q->mq && (qopt->bands != sch->dev->egress_subqueue_count))
> > + return -EINVAL;
>
>
> A nice thing you could do for the user here is use
> egress_subqueue_count as default when qopt->bands == 0 (and
> change tc prio to accept 0 in case it doesn't).
prio only allows a minimum of 2 bands right now. I see what you're
suggesting though; let me think about this. I do like this suggestion.
Thanks,
-PJ Waskiewicz
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-25 17:27 ` Waskiewicz Jr, Peter P
@ 2007-06-25 17:29 ` Patrick McHardy
0 siblings, 0 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-25 17:29 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
>
>> And RTA_PUT_FLAG. Now that I think of it, does it even make
>> sense to have a prio private flag for this instead of a qdisc
>> global one?
>>
>
> There currently aren't any other qdiscs that are natural fits for
> multiqueue that I can see. I can see the benefit though of having this
> as a global flag in the qdisc API; let me check it out, and if it makes
> sense, I can move it.
>
Yes, that thought occurred to me as well. Keeping it private
seems better.
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-24 12:16 ` Patrick McHardy
2007-06-25 17:27 ` Waskiewicz Jr, Peter P
@ 2007-06-25 21:53 ` Waskiewicz Jr, Peter P
2007-06-25 21:58 ` Patrick McHardy
1 sibling, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-25 21:53 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> > @@ -70,14 +72,28 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
> > #endif
> > if (TC_H_MAJ(band))
> > band = 0;
> > + if (q->mq)
> > + skb->queue_mapping = q->prio2band[band&TC_PRIO_MAX];
> > + else
> > + skb->queue_mapping = 0;
>
>
> Might look cleaner if you have one central point where
> queue_mapping is set and the band is returned.
I've taken a stab at this. I can have one return point, but I'll still
have multiple assignments of skb->queue_mapping due to the different
branches for which queue to select in the qdisc. I suppose we can do a
rewrite of prio_classify(), but to me that seems beyond the scope of the
multiqueue patches themselves. What do you think?
Thanks,
-PJ
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-25 21:53 ` Waskiewicz Jr, Peter P
@ 2007-06-25 21:58 ` Patrick McHardy
2007-06-25 22:07 ` Waskiewicz Jr, Peter P
0 siblings, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-25 21:58 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
>>> @@ -70,14 +72,28 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
>>> #endif
>>> if (TC_H_MAJ(band))
>>> band = 0;
>>> + if (q->mq)
>>> + skb->queue_mapping = q->prio2band[band&TC_PRIO_MAX];
>>> + else
>>> + skb->queue_mapping = 0;
>>>
>> Might look cleaner if you have one central point where
>> queue_mapping is set and the band is returned.
>>
>
> I've taken a stab at this. I can have one return point, but I'll still
> have multiple assignments of skb->queue_mapping due to the different
> branches for which queue to select in the qdisc. I suppose we can do a
> rewrite of prio_classify(), but to me that seems beyond the scope of the
> multiqueue patches themselves. What do you think?
>
That's not necessary. I just thought you could add one exit point:
...
out:
skb->queue_mapping = q->mq ? band : 0;
return q->queues[band];
}
But if that doesn't work, don't bother...
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-25 21:58 ` Patrick McHardy
@ 2007-06-25 22:07 ` Waskiewicz Jr, Peter P
0 siblings, 0 replies; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-25 22:07 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> That's not necessary. I just thought you could add one exit point:
>
>
> ...
> out:
> skb->queue_mapping = q->mq ? band : 0;
> return q->queues[band];
> }
>
> But if that doesn't work, don't bother...
Unfortunately it won't, given how band might be used like this to select
the queue:
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
I'll keep this in mind though, and if it can be done cleanly, I'll
submit a patch.
Thanks Patrick,
-PJ
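For the record, the shape Patrick eventually used (in his cleanup patch later in this thread) sidesteps that case by normalizing band first, so the single exit works after all:

	band = q->prio2band[band & TC_PRIO_MAX];
	goto out;
	...
out:
	if (q->mq)
		skb_set_queue_mapping(skb, band);
	return q->queues[band];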
* [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:20 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
@ 2007-06-28 16:21 ` PJ Waskiewicz
2007-06-28 16:35 ` Patrick McHardy
2007-06-28 17:13 ` Patrick McHardy
0 siblings, 2 replies; 40+ messages in thread
From: PJ Waskiewicz @ 2007-06-28 16:21 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok, hadi, kaber
Updated: Cleaned up Kconfig options for multiqueue. Cleaned up
sch_rr and sch_prio multiqueue handling. Added nested compat netlink
options for new options. Allowing a 0 band option for prio and rr when
in multiqueue mode so it defaults to the number of queues on the NIC.
Add the new sch_rr qdisc for multiqueue network device support.
Allow sch_prio and sch_rr to be compiled with or without multiqueue
hardware support.
sch_rr is part of sch_prio, and is referenced from MODULE_ALIAS. This
was done since sch_prio and sch_rr only differ in their dequeue routine.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---
include/linux/pkt_sched.h | 9 +++
net/sched/Kconfig | 23 +++++++
net/sched/sch_prio.c | 147 +++++++++++++++++++++++++++++++++++++++++----
3 files changed, 166 insertions(+), 13 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index d10f353..268c515 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -101,6 +101,15 @@ struct tc_prio_qopt
__u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
};
+enum
+{
+ TCA_PRIO_UNSPEC,
+ TCA_PRIO_MQ,
+ __TCA_PRIO_MAX
+};
+
+#define TCA_PRIO_MAX (__TCA_PRIO_MAX - 1)
+
/* TBF section */
struct tc_tbf_qopt
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df84..65ee9e7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -111,6 +111,29 @@ config NET_SCH_PRIO
To compile this code as a module, choose M here: the
module will be called sch_prio.
+config NET_SCH_RR
+ tristate "Multi Band Round Robin Queuing (RR)"
+ select NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to use an n-band round robin packet
+ scheduler.
+
+ The module uses sch_prio for its framework and is aliased as
+ sch_rr, so it will load sch_prio, although it is referred
+ to using sch_rr.
+
+config NET_SCH_MULTIQUEUE
+ bool "Multiple hardware queue support"
+ ---help---
+ Say Y here if you want to allow supported qdiscs to assign flows to
+ multiple hardware queues on an ethernet device. This will
+ still work on devices with 1 queue.
+
+ Current qdiscs supporting this feature are NET_SCH_PRIO and
+ NET_SCH_RR.
+
+ Most people will say N here.
+
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 6d7542c..2ceba92 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -40,9 +40,13 @@
struct prio_sched_data
{
int bands;
+ int curband; /* for round-robin */
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ unsigned char mq;
+#endif
};
@@ -70,14 +74,34 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ if (q->mq)
+ skb->queue_mapping =
+ q->prio2band[band&TC_PRIO_MAX];
+ else
+ skb->queue_mapping = 0;
+#endif
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
- if (band >= q->bands)
+ if (band >= q->bands) {
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ if (q->mq)
+ skb->queue_mapping = q->prio2band[0];
+ else
+ skb->queue_mapping = 0;
+#endif
return q->queues[q->prio2band[0]];
+ }
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ if (q->mq)
+ skb->queue_mapping = band;
+ else
+ skb->queue_mapping = 0;
+#endif
return q->queues[band];
}
@@ -144,17 +168,65 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, (q->mq ? prio : 0))) {
+#endif
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
}
+#endif
}
return NULL;
}
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ int bandcount;
+
+ /* Only take one pass through the queues. If nothing is available,
+ * return nothing.
+ */
+ for (bandcount = 0; bandcount < q->bands; bandcount++) {
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues. If the queue is stopped, try the
+ * next queue.
+ */
+ if (!netif_subqueue_stopped(sch->dev, (q->mq ? q->curband : 0))) {
+#endif
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ return skb;
+ }
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ }
+#endif
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ }
+ return NULL;
+}
+
static unsigned int prio_drop(struct Qdisc* sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
@@ -198,21 +270,39 @@ prio_destroy(struct Qdisc* sch)
static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
- struct tc_prio_qopt *qopt = RTA_DATA(opt);
+ struct tc_prio_qopt *qopt;
+ struct rtattr *tb[TCA_PRIO_MAX];
int i;
- if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+ if (rtattr_parse_nested_compat(tb, TCA_PRIO_MAX, opt, qopt,
+ sizeof(*qopt)))
return -EINVAL;
- if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+ q->bands = qopt->bands;
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ /* If we're multiqueue, make sure the number of incoming bands
+ * matches the number of queues on the device we're associating with.
+ * If the number of bands requested is zero, then set q->bands to
+ * dev->egress_subqueue_count.
+ */
+ q->mq = RTA_GET_FLAG(tb[TCA_PRIO_MQ - 1]);
+
+ if (q->mq) {
+ if (q->bands == 0)
+ q->bands = sch->dev->egress_subqueue_count;
+ else if (q->bands != sch->dev->egress_subqueue_count)
+ return -EINVAL;
+ }
+#endif
+
+ if (q->bands > TCQ_PRIO_BANDS || q->bands < 2)
return -EINVAL;
for (i=0; i<=TC_PRIO_MAX; i++) {
- if (qopt->priomap[i] >= qopt->bands)
+ if (qopt->priomap[i] >= q->bands)
return -EINVAL;
}
sch_tree_lock(sch);
- q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
@@ -268,11 +358,19 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
+ struct rtattr *nest;
struct tc_prio_qopt opt;
opt.bands = q->bands;
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
- RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+ nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ if (q->mq)
+ RTA_PUT_FLAG(skb, TCA_PRIO_MQ);
+#endif
+ RTA_NEST_COMPAT_END(skb, nest);
+
return skb->len;
rtattr_failure:
@@ -443,17 +541,40 @@ static struct Qdisc_ops prio_qdisc_ops = {
.owner = THIS_MODULE,
};
+static struct Qdisc_ops rr_qdisc_ops = {
+ .next = NULL,
+ .cl_ops = &prio_class_ops,
+ .id = "rr",
+ .priv_size = sizeof(struct prio_sched_data),
+ .enqueue = prio_enqueue,
+ .dequeue = rr_dequeue,
+ .requeue = prio_requeue,
+ .drop = prio_drop,
+ .init = prio_init,
+ .reset = prio_reset,
+ .destroy = prio_destroy,
+ .change = prio_tune,
+ .dump = prio_dump,
+ .owner = THIS_MODULE,
+};
+
static int __init prio_module_init(void)
{
- return register_qdisc(&prio_qdisc_ops);
+ int err;
+ err = register_qdisc(&prio_qdisc_ops);
+ if (!err)
+ err = register_qdisc(&rr_qdisc_ops);
+ return err;
}
static void __exit prio_module_exit(void)
{
unregister_qdisc(&prio_qdisc_ops);
+ unregister_qdisc(&rr_qdisc_ops);
}
module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");
+MODULE_ALIAS("sch_rr");
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:21 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
@ 2007-06-28 16:35 ` Patrick McHardy
2007-06-28 16:43 ` Waskiewicz Jr, Peter P
2007-06-28 16:50 ` Patrick McHardy
2007-06-28 17:13 ` Patrick McHardy
1 sibling, 2 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-28 16:35 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok, hadi
PJ Waskiewicz wrote:
> +
> static int __init prio_module_init(void)
> {
> - return register_qdisc(&prio_qdisc_ops);
> + int err;
> + err = register_qdisc(&prio_qdisc_ops);
> + if (!err)
> + err = register_qdisc(&rr_qdisc_ops);
> + return err;
> }
>
That's still broken. I'll fix this and some minor cleanliness issues myself so
you don't have to go through another resend.
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:35 ` Patrick McHardy
@ 2007-06-28 16:43 ` Waskiewicz Jr, Peter P
2007-06-28 16:46 ` Patrick McHardy
2007-06-28 16:50 ` Patrick McHardy
1 sibling, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-28 16:43 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> PJ Waskiewicz wrote:
>
> > +
> > static int __init prio_module_init(void)
> > {
> > - return register_qdisc(&prio_qdisc_ops);
> > + int err;
> > + err = register_qdisc(&prio_qdisc_ops);
> > + if (!err)
> > + err = register_qdisc(&rr_qdisc_ops);
> > + return err;
> > }
> >
>
> That's still broken. I'll fix this and some minor cleanliness
> issues myself so you don't have to go through another resend.
Auke and I just looked at register_qdisc() and this code. Maybe we
haven't had enough coffee yet, but register_qdisc() returns 0 on
success. So if register_qdisc(&prio_qdisc_ops) succeeds, then
rr_qdisc_ops gets registered. I'm curious what is broken with this.
Thanks Patrick
-PJ Waskiewicz
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:43 ` Waskiewicz Jr, Peter P
@ 2007-06-28 16:46 ` Patrick McHardy
2007-06-28 16:50 ` Waskiewicz Jr, Peter P
0 siblings, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-28 16:46 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
>>PJ Waskiewicz wrote:
>>
>>
>>>+
>>> static int __init prio_module_init(void)
>>> {
>>>- return register_qdisc(&prio_qdisc_ops);
>>>+ int err;
>>>+ err = register_qdisc(&prio_qdisc_ops);
>>>+ if (!err)
>>>+ err = register_qdisc(&rr_qdisc_ops);
>>>+ return err;
>>> }
>>>
>>
>>That's still broken. I'll fix this and some minor cleanliness
>>issues myself so you don't have to go through another resend.
>
>
> Auke and I just looked at register_qdisc() and this code. Maybe we
> haven't had enough coffee yet, but register_qdisc() returns 0 on
> success. So if register_qdisc(&prio_qdisc_ops) succeeds, then
> rr_qdisc_ops gets registered. I'm curious what is broken with this.
That's not error handling. You need to do:
err = register qdisc 1
if (err)
return err;
err = register qdisc 2
if (err)
unregister qdisc 1
return err
anyways, I already fixed that and cleaned up prio_classify the
way I suggested. Will send shortly.
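Spelled out, that pattern for the two registrations looks like this (as it appears in the fixed patch below):

static int __init prio_module_init(void)
{
	int err;

	err = register_qdisc(&prio_qdisc_ops);
	if (err < 0)
		return err;
	err = register_qdisc(&rr_qdisc_ops);
	if (err < 0)
		unregister_qdisc(&prio_qdisc_ops);
	return err;
}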
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:46 ` Patrick McHardy
@ 2007-06-28 16:50 ` Waskiewicz Jr, Peter P
2007-06-28 16:53 ` Patrick McHardy
0 siblings, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-28 16:50 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> That's not error handling. You need to do:
>
> err = register qdisc 1
> if (err)
> return err;
> err = register qdisc 2
> if (err)
> unregister qdisc 1
> return err
>
> anyways, I already fixed that and cleaned up prio_classify
> the way I suggested. Will send shortly.
Thanks for fixing; however, the current sch_prio doesn't unregister the
qdisc if register_qdisc() on prio fails, or does that happen implicitly
because the module will probably unload?
Thanks again Patrick,
-PJ
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:35 ` Patrick McHardy
2007-06-28 16:43 ` Waskiewicz Jr, Peter P
@ 2007-06-28 16:50 ` Patrick McHardy
1 sibling, 0 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-28 16:50 UTC (permalink / raw)
To: Patrick McHardy; +Cc: PJ Waskiewicz, davem, netdev, jeff, auke-jan.h.kok, hadi
Patrick McHardy wrote:
> PJ Waskiewicz wrote:
>
>> +
>> static int __init prio_module_init(void)
>> {
>> - return register_qdisc(&prio_qdisc_ops);
>> + int err;
>> + err = register_qdisc(&prio_qdisc_ops);
>> + if (!err)
>> + err = register_qdisc(&rr_qdisc_ops);
>> + return err;
>> }
>>
>
>
> That's still broken. I'll fix this and some minor cleanliness issues myself so
> you don't have to go through another resend.
Here it is, fixed error handling and cleaned up prio_classify.
There are still too many ifdefs in there for my taste, and I'm
wondering whether the NET_SCH_MULTIQUEUE option should really
be NETDEVICES_MULTIQUEUE. That would allow to move the #ifdefs
to netif_subqueue_stopped and keep the qdiscs clean.
I can send a patch for that on top of your patches.
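The idea would be roughly the following (a sketch only; the option name and the egress_subqueue layout are assumptions about that follow-up patch):

static inline int netif_subqueue_stopped(const struct net_device *dev,
					 u16 queue_index)
{
#ifdef CONFIG_NETDEVICES_MULTIQUEUE
	return test_bit(__LINK_STATE_XOFF,
			&dev->egress_subqueue[queue_index].state);
#else
	return 0;	/* single-queue build: a subqueue is never stopped */
#endif
}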
[SCHED] Qdisc changes and sch_rr added for multiqueue
Add the new sch_rr qdisc for multiqueue network device support.
Allow sch_prio and sch_rr to be compiled with or without multiqueue
hardware support.
sch_rr is part of sch_prio, and is referenced from MODULE_ALIAS. This
was done since sch_prio and sch_rr only differ in their dequeue routine.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
commit ab2c102d05981e983b494e9211e1c987ae0b80ee
tree edb9457aeed6eb4ac8adef517b783b3361ba8830
parent 1937d8b868253885584dd49fe7ba804eda6b525f
author Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com> Thu, 28 Jun 2007 18:47:49 +0200
committer Patrick McHardy <kaber@trash.net> Thu, 28 Jun 2007 18:47:49 +0200
include/linux/pkt_sched.h | 9 +++
net/sched/Kconfig | 23 +++++++
net/sched/sch_prio.c | 142 ++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 159 insertions(+), 15 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index d10f353..268c515 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -101,6 +101,15 @@ struct tc_prio_qopt
__u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
};
+enum
+{
+ TCA_PRIO_UNSPEC,
+ TCA_PRIO_MQ,
+ __TCA_PRIO_MAX
+};
+
+#define TCA_PRIO_MAX (__TCA_PRIO_MAX - 1)
+
/* TBF section */
struct tc_tbf_qopt
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df84..65ee9e7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -111,6 +111,29 @@ config NET_SCH_PRIO
To compile this code as a module, choose M here: the
module will be called sch_prio.
+config NET_SCH_RR
+ tristate "Multi Band Round Robin Queuing (RR)"
+ select NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to use an n-band round robin packet
+ scheduler.
+
+ The module uses sch_prio for its framework and is aliased as
+ sch_rr, so it will load sch_prio, although it is referred
+ to using sch_rr.
+
+config NET_SCH_MULTIQUEUE
+ bool "Multiple hardware queue support"
+ ---help---
+ Say Y here if you want to allow supported qdiscs to assign flows to
+ multiple hardware queues on an ethernet device. This will
+ still work on devices with 1 queue.
+
+ Current qdiscs supporting this feature are NET_SCH_PRIO and
+ NET_SCH_RR.
+
+ Most people will say N here.
+
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 6d7542c..b35b9c5 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -40,9 +40,13 @@
struct prio_sched_data
{
int bands;
+ int curband; /* for round-robin */
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ unsigned char mq;
+#endif
};
@@ -70,14 +74,21 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
- return q->queues[q->prio2band[band&TC_PRIO_MAX]];
+ band = q->prio2band[band&TC_PRIO_MAX];
+ goto out;
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
if (band >= q->bands)
- return q->queues[q->prio2band[0]];
-
+ band = q->prio2band[0];
+out:
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ if (q->mq)
+ skb->queue_mapping = band;
+ else
+ skb->queue_mapping = 0;
+#endif
return q->queues[band];
}
@@ -144,17 +155,65 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, (q->mq ? prio : 0))) {
+#endif
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
}
+#endif
}
return NULL;
}
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ int bandcount;
+
+ /* Only take one pass through the queues. If nothing is available,
+ * return nothing.
+ */
+ for (bandcount = 0; bandcount < q->bands; bandcount++) {
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues. If the queue is stopped, try the
+ * next queue.
+ */
+ if (!netif_subqueue_stopped(sch->dev, (q->mq ? q->curband : 0))) {
+#endif
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ return skb;
+ }
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ }
+#endif
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ }
+ return NULL;
+}
+
static unsigned int prio_drop(struct Qdisc* sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
@@ -198,21 +257,39 @@ prio_destroy(struct Qdisc* sch)
static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
- struct tc_prio_qopt *qopt = RTA_DATA(opt);
+ struct tc_prio_qopt *qopt;
+ struct rtattr *tb[TCA_PRIO_MAX];
int i;
- if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+ if (rtattr_parse_nested_compat(tb, TCA_PRIO_MAX, opt, qopt,
+ sizeof(*qopt)))
return -EINVAL;
- if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+ q->bands = qopt->bands;
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ /* If we're multiqueue, make sure the number of incoming bands
+ * matches the number of queues on the device we're associating with.
+ * If the number of bands requested is zero, then set q->bands to
+ * dev->egress_subqueue_count.
+ */
+ q->mq = RTA_GET_FLAG(tb[TCA_PRIO_MQ - 1]);
+
+ if (q->mq) {
+ if (q->bands == 0)
+ q->bands = sch->dev->egress_subqueue_count;
+ else if (q->bands != sch->dev->egress_subqueue_count)
+ return -EINVAL;
+ }
+#endif
+
+ if (q->bands > TCQ_PRIO_BANDS || q->bands < 2)
return -EINVAL;
for (i=0; i<=TC_PRIO_MAX; i++) {
- if (qopt->priomap[i] >= qopt->bands)
+ if (qopt->priomap[i] >= q->bands)
return -EINVAL;
}
sch_tree_lock(sch);
- q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
@@ -268,11 +345,19 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
+ struct rtattr *nest;
struct tc_prio_qopt opt;
opt.bands = q->bands;
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
- RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+ nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+#ifdef CONFIG_NET_SCH_MULTIQUEUE
+ if (q->mq)
+ RTA_PUT_FLAG(skb, TCA_PRIO_MQ);
+#endif
+ RTA_NEST_COMPAT_END(skb, nest);
+
return skb->len;
rtattr_failure:
@@ -443,17 +528,44 @@ static struct Qdisc_ops prio_qdisc_ops = {
.owner = THIS_MODULE,
};
+static struct Qdisc_ops rr_qdisc_ops = {
+ .next = NULL,
+ .cl_ops = &prio_class_ops,
+ .id = "rr",
+ .priv_size = sizeof(struct prio_sched_data),
+ .enqueue = prio_enqueue,
+ .dequeue = rr_dequeue,
+ .requeue = prio_requeue,
+ .drop = prio_drop,
+ .init = prio_init,
+ .reset = prio_reset,
+ .destroy = prio_destroy,
+ .change = prio_tune,
+ .dump = prio_dump,
+ .owner = THIS_MODULE,
+};
+
static int __init prio_module_init(void)
{
- return register_qdisc(&prio_qdisc_ops);
+ int err;
+
+ err = register_qdisc(&prio_qdisc_ops);
+ if (err < 0)
+ return err;
+ err = register_qdisc(&rr_qdisc_ops);
+ if (err < 0)
+ unregister_qdisc(&prio_qdisc_ops);
+ return err;
}
static void __exit prio_module_exit(void)
{
unregister_qdisc(&prio_qdisc_ops);
+ unregister_qdisc(&rr_qdisc_ops);
}
module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");
+MODULE_ALIAS("sch_rr");
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:50 ` Waskiewicz Jr, Peter P
@ 2007-06-28 16:53 ` Patrick McHardy
0 siblings, 0 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-28 16:53 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
> Thanks for fixing; however, the current sch_prio doesn't unregister the
> qdisc if register_qdisc() on prio fails, or does that happen implicitly
> because the module will probably unload?
If it failed, there's nothing to unregister. But when you register two
qdiscs and the second one fails, you have to unregister the first one.
Your way works too, but it might fail registering the second one without
providing any feedback to the user.
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 16:21 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
2007-06-28 16:35 ` Patrick McHardy
@ 2007-06-28 17:13 ` Patrick McHardy
2007-06-28 19:04 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-28 17:13 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok, hadi
PJ Waskiewicz wrote:
> +#ifdef CONFIG_NET_SCH_MULTIQUEUE
> + if (q->mq)
> + skb->queue_mapping =
> + q->prio2band[band&TC_PRIO_MAX];
> + else
> + skb->queue_mapping = 0;
> +#endif
Setting it to zero here is wrong; consider:
root qdisc: prio multiqueue
child qdisc: prio non-multiqueue
The top-level qdisc will set it, the child qdisc will unset it again.
When multiqueue is inactive it should not touch it.
I'll fix that as well.
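The fix amounts to touching the mapping only when this qdisc is actually in multiqueue mode (a sketch; the final patch later in the thread does this via skb_set_queue_mapping()):

	if (q->mq)
		skb_set_queue_mapping(skb, band);
	/* non-multiqueue: leave a parent's queue_mapping alone */
	return q->queues[band];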
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 17:13 ` Patrick McHardy
@ 2007-06-28 19:04 ` Waskiewicz Jr, Peter P
2007-06-28 19:17 ` Patrick McHardy
0 siblings, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-28 19:04 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> PJ Waskiewicz wrote:
> > +#ifdef CONFIG_NET_SCH_MULTIQUEUE
> > + if (q->mq)
> > + skb->queue_mapping = q->prio2band[band&TC_PRIO_MAX];
> > + else
> > + skb->queue_mapping = 0;
> > +#endif
>
>
> Setting it to zero here is wrong, consider:
>
> root qdisc: prio multiqueue
> child qdisc: prio non-multiqueue
>
> The top-level qdisc will set it, the child qdisc will unset it again.
> When multiqueue is inactive it should not touch it.
>
> I'll fix that as well.
But the child can't assume the device is multiqueue if the child is
non-multiqueue. This is the same issue with IP forwarding, where if you
forward through a multiqueue device to a non-mq device, you don't know
if the destination device is multiqueue. So the last qdisc to actually
dequeue into a device should have control of what the queue mapping is. If
a user had a multiqueue qdisc as root, and configured a child qdisc as
non-mq, that is a configuration error if the underlying device is indeed
multiqueue IMO.
-PJ
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 19:04 ` Waskiewicz Jr, Peter P
@ 2007-06-28 19:17 ` Patrick McHardy
2007-06-28 19:21 ` Waskiewicz Jr, Peter P
0 siblings, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-28 19:17 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
>>PJ Waskiewicz wrote:
>>
>>>+#ifdef CONFIG_NET_SCH_MULTIQUEUE
>>>+ if (q->mq)
>>>+ skb->queue_mapping = q->prio2band[band&TC_PRIO_MAX];
>>>+ else
>>>+ skb->queue_mapping = 0;
>>>+#endif
>>
>>
>>Setting it to zero here is wrong, consider:
>>
>>root qdisc: prio multiqueue
>>child qdisc: prio non-multiqueue
>>
>>The top-level qdisc will set it, the child qdisc will unset it again.
>>When multiqueue is inactive it should not touch it.
>>
>>I'll fix that as well.
>
>
> But the child can't assume the device is multiqueue if the child is
> non-multiqueue.
The child doesn't have to assume anything.
> This is the same issue with IP forwarding, where if you
> forward through a multiqueue device to a non-mq device, you don't know
> if the destination device is multiqueue.
No, it's not. I'm talking about nested qdiscs, which are all on
a single device.
> So the last qdisc to actually
> dequeue into a device should have control of what the queue mapping is.
Fully agreed. And that is always the top-level qdisc.
> If
> a user had a multiqueue qdisc as root, and configured a child qdisc as
> non-mq, that is a configuration error if the underlying device is indeed
> multiqueue IMO.
Absolutely not. First of all, it's perfectly valid to use non-multiqueue
qdiscs on multiqueue devices. Secondly, it's only the root qdisc that
has to know about multiqueue since that one controls the child qdiscs.
Think about it: it makes absolutely no sense to have the child
qdisc even know about multiqueue. Changing my example to have
a multiqueue qdisc as child:
root qdisc: 2 band prio multiqueue
child qdisc of band 0: 2 band prio multiqueue
When the root qdisc decides to dequeue band 0, it checks whether subqueue
0 is active and dequeues the child qdisc. If the child qdisc is indeed
another multiqueue qdisc as you suggest, it might decide to dequeue its
own band 1 and check that subqueue's state. So where should the packet
finally end up? And what if one of the two subqueues is inactive?
The only reasonable thing it can do is not care about multiqueue and
just dequeue as usual. In fact I think it should be an error to
configure multiqueue on a non-root qdisc.
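In code, that restriction is a short check in prio_tune(), condensed here from the final patch later in the thread:

	if (q->mq) {
		if (sch->handle != TC_H_ROOT)
			return -EINVAL;		/* multiqueue only at the root */
		if (!netif_is_multiqueue(sch->dev))
			return -EOPNOTSUPP;	/* device has a single queue */
	}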
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 19:17 ` Patrick McHardy
@ 2007-06-28 19:21 ` Waskiewicz Jr, Peter P
2007-06-28 19:24 ` Patrick McHardy
0 siblings, 1 reply; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-28 19:21 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> Absolutely not. First of all, it's perfectly valid to use
> non-multiqueue qdiscs on multiqueue devices. Secondly, it's
> only the root qdisc that has to know about multiqueue since
> that one controls the child qdiscs.
>
> Think about it: it makes absolutely no sense to have the
> child qdisc even know about multiqueue. Changing my example
> to have a multiqueue qdisc as child:
>
> root qdisc: 2 band prio multiqueue
> child qdisc of band 0: 2 band prio multiqueue
>
> When the root qdisc decides to dequeue band0, it checks
> whether subqueue 0 is active and dequeues the child qdisc. If
> the child qdisc is indeed another multiqueue qdisc as you
> suggest, it might decide to dequeue its own band 1 and check
> that subqueue state. So where should the packet finally end
> up? And what if one of both subqueues is inactive?
>
> The only reasonable thing it can do is not care about
> multiqueue and just dequeue as usual. In fact I think it
> should be an error to configure multiqueue on a non-root qdisc.
Ack. This is a thought process that trips me up from time to time...I
see child qdisc, and think that's the last qdisc to dequeue and send to
the device, not the first one to dequeue. So please disregard my
comments before; I totally agree with you. Great catch here; I really
like the prio_classify() cleanup.
As always, many thanks for your feedback and help Patrick.
-PJ
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 19:21 ` Waskiewicz Jr, Peter P
@ 2007-06-28 19:24 ` Patrick McHardy
2007-06-28 19:27 ` Waskiewicz Jr, Peter P
2007-06-29 4:20 ` David Miller
0 siblings, 2 replies; 40+ messages in thread
From: Patrick McHardy @ 2007-06-28 19:24 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
Waskiewicz Jr, Peter P wrote:
>>[...]
>>The only reasonable thing it can do is not care about
>>multiqueue and just dequeue as usual. In fact I think it
>>should be an error to configure multiqueue on a non-root qdisc.
>
>
> Ack. This is a thought process that trips me up from time to time...I
> see child qdisc, and think that's the last qdisc to dequeue and send to
> the device, not the first one to dequeue. So please disregard my
> comments before; I totally agree with you. Great catch here; I really
> like the prio_classify() cleanup.
Thanks. This updated patch makes configuring a non-root qdisc for
multiqueue an error.
[SCHED] Qdisc changes and sch_rr added for multiqueue
Add the new sch_rr qdisc for multiqueue network device support.
Allow sch_prio and sch_rr to be compiled with or without multiqueue
hardware support.
sch_rr is part of sch_prio, and is referenced from MODULE_ALIAS. This
was done since sch_prio and sch_rr only differ in their dequeue routine.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
commit 8798d6bf4f3ed4f5b162184282adc714ef5b69b1
tree b3ba373a0534b905b34abc520eba6adecdcc3ca5
parent 48e334930c5fbb64a821733a7056e2800057c128
author Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com> Thu, 28 Jun 2007 18:47:49 +0200
committer Patrick McHardy <kaber@trash.net> Thu, 28 Jun 2007 21:23:44 +0200
include/linux/pkt_sched.h | 9 +++
net/sched/Kconfig | 11 ++++
net/sched/sch_prio.c | 129 ++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 134 insertions(+), 15 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index d10f353..268c515 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -101,6 +101,15 @@ struct tc_prio_qopt
__u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
};
+enum
+{
+ TCA_PRIO_UNSPEC,
+ TCA_PRIO_MQ,
+ __TCA_PRIO_MAX
+};
+
+#define TCA_PRIO_MAX (__TCA_PRIO_MAX - 1)
+
/* TBF section */
struct tc_tbf_qopt
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df84..f321794 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -111,6 +111,17 @@ config NET_SCH_PRIO
To compile this code as a module, choose M here: the
module will be called sch_prio.
+config NET_SCH_RR
+ tristate "Multi Band Round Robin Queuing (RR)"
+ select NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to use an n-band round robin packet
+ scheduler.
+
+ The module uses sch_prio for its framework and is aliased as
+ sch_rr, so it will load sch_prio, although it is referred
+ to using sch_rr.
+
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 6d7542c..4045220 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -40,9 +40,11 @@
struct prio_sched_data
{
int bands;
+ int curband; /* for round-robin */
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+ int mq;
};
@@ -70,14 +72,17 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
- return q->queues[q->prio2band[band&TC_PRIO_MAX]];
+ band = q->prio2band[band&TC_PRIO_MAX];
+ goto out;
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
if (band >= q->bands)
- return q->queues[q->prio2band[0]];
-
+ band = q->prio2band[0];
+out:
+ if (q->mq)
+ skb_set_queue_mapping(skb, band);
return q->queues[band];
}
@@ -144,17 +149,58 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, (q->mq ? prio : 0))) {
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
}
}
return NULL;
}
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ int bandcount;
+
+ /* Only take one pass through the queues. If nothing is available,
+ * return nothing.
+ */
+ for (bandcount = 0; bandcount < q->bands; bandcount++) {
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues. If the queue is stopped, try the
+ * next queue.
+ */
+ if (!netif_subqueue_stopped(sch->dev,
+ (q->mq ? q->curband : 0))) {
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ return skb;
+ }
+ }
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ }
+ return NULL;
+}
+
static unsigned int prio_drop(struct Qdisc* sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
@@ -198,21 +244,41 @@ prio_destroy(struct Qdisc* sch)
static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
- struct tc_prio_qopt *qopt = RTA_DATA(opt);
+ struct tc_prio_qopt *qopt;
+ struct rtattr *tb[TCA_PRIO_MAX];
int i;
- if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+ if (rtattr_parse_nested_compat(tb, TCA_PRIO_MAX, opt, qopt,
+ sizeof(*qopt)))
return -EINVAL;
- if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+ q->bands = qopt->bands;
+ /* If we're multiqueue, make sure the number of incoming bands
+ * matches the number of queues on the device we're associating with.
+ * If the number of bands requested is zero, then set q->bands to
+ * dev->egress_subqueue_count.
+ */
+ q->mq = RTA_GET_FLAG(tb[TCA_PRIO_MQ - 1]);
+ if (q->mq) {
+ if (sch->handle != TC_H_ROOT)
+ return -EINVAL;
+ if (netif_is_multiqueue(sch->dev)) {
+ if (q->bands == 0)
+ q->bands = sch->dev->egress_subqueue_count;
+ else if (q->bands != sch->dev->egress_subqueue_count)
+ return -EINVAL;
+ } else
+ return -EOPNOTSUPP;
+ }
+
+ if (q->bands > TCQ_PRIO_BANDS || q->bands < 2)
return -EINVAL;
for (i=0; i<=TC_PRIO_MAX; i++) {
- if (qopt->priomap[i] >= qopt->bands)
+ if (qopt->priomap[i] >= q->bands)
return -EINVAL;
}
sch_tree_lock(sch);
- q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
@@ -268,11 +334,17 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
+ struct rtattr *nest;
struct tc_prio_qopt opt;
opt.bands = q->bands;
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
- RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+ nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ if (q->mq)
+ RTA_PUT_FLAG(skb, TCA_PRIO_MQ);
+ RTA_NEST_COMPAT_END(skb, nest);
+
return skb->len;
rtattr_failure:
@@ -443,17 +515,44 @@ static struct Qdisc_ops prio_qdisc_ops = {
.owner = THIS_MODULE,
};
+static struct Qdisc_ops rr_qdisc_ops = {
+ .next = NULL,
+ .cl_ops = &prio_class_ops,
+ .id = "rr",
+ .priv_size = sizeof(struct prio_sched_data),
+ .enqueue = prio_enqueue,
+ .dequeue = rr_dequeue,
+ .requeue = prio_requeue,
+ .drop = prio_drop,
+ .init = prio_init,
+ .reset = prio_reset,
+ .destroy = prio_destroy,
+ .change = prio_tune,
+ .dump = prio_dump,
+ .owner = THIS_MODULE,
+};
+
static int __init prio_module_init(void)
{
- return register_qdisc(&prio_qdisc_ops);
+ int err;
+
+ err = register_qdisc(&prio_qdisc_ops);
+ if (err < 0)
+ return err;
+ err = register_qdisc(&rr_qdisc_ops);
+ if (err < 0)
+ unregister_qdisc(&prio_qdisc_ops);
+ return err;
}
static void __exit prio_module_exit(void)
{
unregister_qdisc(&prio_qdisc_ops);
+ unregister_qdisc(&rr_qdisc_ops);
}
module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");
+MODULE_ALIAS("sch_rr");
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 19:24 ` Patrick McHardy
@ 2007-06-28 19:27 ` Waskiewicz Jr, Peter P
2007-06-29 4:20 ` David Miller
1 sibling, 0 replies; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-28 19:27 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H, hadi
> Waskiewicz Jr, Peter P wrote:
> >>[...]
> >>The only reasonable thing it can do is not care about multiqueue and
> >>just dequeue as usual. In fact I think it should be an error to
> >>configure multiqueue on a non-root qdisc.
> >
> > Ack. This is a thought process that trips me up from time to time...I
> > see child qdisc, and think that's the last qdisc to dequeue and send
> > to the device, not the first one to dequeue. So please disregard my
> > comments before; I totally agree with you. Great catch here; I really
> > like the prio_classify() cleanup.
>
>
> Thanks. This updated patch makes configuring a non-root qdisc
> for multiqueue an error.
>
The patch looks fine to me. Thanks Patrick.
-PJ
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-28 19:24 ` Patrick McHardy
2007-06-28 19:27 ` Waskiewicz Jr, Peter P
@ 2007-06-29 4:20 ` David Miller
2007-06-29 8:45 ` Waskiewicz Jr, Peter P
2007-06-30 14:33 ` Patrick McHardy
1 sibling, 2 replies; 40+ messages in thread
From: David Miller @ 2007-06-29 4:20 UTC (permalink / raw)
To: kaber; +Cc: peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok, hadi
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 28 Jun 2007 21:24:37 +0200
> Waskiewicz Jr, Peter P wrote:
> >>[...]
> >>The only reasonable thing it can do is not care about
> >>multiqueue and just dequeue as usual. In fact I think it
> >>should be an error to configure multiqueue on a non-root qdisc.
> >
> >
> > Ack. This is a thought process that trips me up from time to time...I
> > see child qdisc, and think that's the last qdisc to dequeue and send to
> > the device, not the first one to dequeue. So please disregard my
> > comments before; I totally agree with you. Great catch here; I really
> > like the prio_classify() cleanup.
>
>
> Thanks. This updated patch makes configuring a non-root qdisc for
> multiqueue an error.
Ok everything is checked into net-2.6.23, thanks everyone.
Now I get to pose a problem for everyone: prove to me how useful
this new code is by showing me how it can be used to solve a
recurring problem in virtualized network drivers, one of which I've
had to code up recently; see my most recent blog entry at:
http://vger.kernel.org/~davem/cgi-bin/blog.cgi/index.html
Anyways the gist of the issue is (and this happens for Sun LDOMS
networking, lguest, IBM iSeries, etc.) that we have a single
virtualized network device. There is a "port" to the control
node (which switches packets to the real network for the guest)
and one "port" to each of the other guests.
Each guest gets a unique MAC address. There is a queue per-port
that can fill up.
What all the drivers like this do right now is stop the queue if
any of the per-port queues fill up, and that's what my sunvnet
driver does right now as well. We can thus only wake up the
queue when all of the ports have some space.
The ports (and thus the queues) are selected by destination
MAC address. Each port has a remote MAC address; if there
is an exact match with a port's remote MAC, we'd use that port
and thus that port's queue. If there is no exact match
(some other node on the real network, broadcast, multicast,
etc.) we want to use the control node's port and port queue.
So the problem to solve is to provide a way for drivers to do the
queue selection before the generic queueing layer starts trying to
push things to the driver. Perhaps a classifier in the driver or
similar.
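To make this concrete, a minimal sketch of what such a hook might
look like (both the select_queue callback and the q_index field are
hypothetical; nothing like this exists in the tree yet):
/* Hypothetical hook, called by dev_queue_xmit() before the skb is
 * enqueued to the root qdisc.  Returns the subqueue index for this
 * skb: the matching port's queue on an exact remote-MAC match,
 * otherwise queue 0, standing in for the control node's port.
 */
static u16 vnet_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	struct vnet *vp = netdev_priv(dev);
	struct vnet_port *port = tx_port_find(vp, skb);

	return port ? port->q_index : 0;
}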
The solution to this problem generalizes to the other facility
we want now: hashing the transmit queue by smp_processor_id()
or similar. With that in place we can look at doing the TX locking
per-queue too, as hinted at by the comments above the per-queue
structure in the current net-2.6.23 tree.
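For that case the same hypothetical hook degenerates to a trivial
CPU hash (egress_subqueue_count being the queue count field added by
the multiqueue patches; again, just a sketch):
/* Spread flows across TX queues by submitting CPU; with per-queue
 * TX locks this would let concurrent senders transmit without
 * contending on one device-wide lock.
 */
static u16 cpu_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	return smp_processor_id() % dev->egress_subqueue_count;
}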
My current work-in-progress sunvnet.c driver is included below so
we can discuss things concretely with code.
I'm listening. :-)
-------------------- sunvnet.h --------------------
#ifndef _SUNVNET_H
#define _SUNVNET_H
#define DESC_NCOOKIES(entry_size) \
((entry_size) - sizeof(struct vio_net_desc))
/* length of time before we decide the hardware is borked,
* and dev->tx_timeout() should be called to fix the problem
*/
#define VNET_TX_TIMEOUT (5 * HZ)
#define VNET_TX_RING_SIZE 512
#define VNET_TX_WAKEUP_THRESH(dr) ((dr)->pending / 4)
/* VNET packets are sent in buffers with the first 6 bytes skipped
* so that after the ethernet header the IPv4/IPv6 headers are aligned
* properly.
*/
#define VNET_PACKET_SKIP 6
struct vnet_tx_entry {
void *buf;
unsigned int ncookies;
struct ldc_trans_cookie cookies[2];
};
struct vnet;
struct vnet_port {
struct vio_driver_state vio;
struct hlist_node hash;
u8 raddr[ETH_ALEN];
struct vnet *vp;
struct vnet_tx_entry tx_bufs[VNET_TX_RING_SIZE];
struct list_head list;
};
static inline struct vnet_port *to_vnet_port(struct vio_driver_state *vio)
{
return container_of(vio, struct vnet_port, vio);
}
#define VNET_PORT_HASH_SIZE 16
#define VNET_PORT_HASH_MASK (VNET_PORT_HASH_SIZE - 1)
static inline unsigned int vnet_hashfn(u8 *mac)
{
unsigned int val = mac[4] ^ mac[5];
return val & (VNET_PORT_HASH_MASK);
}
struct vnet {
/* Protects port_list and port_hash. */
spinlock_t lock;
struct net_device *dev;
u32 msg_enable;
struct vio_dev *vdev;
struct list_head port_list;
struct hlist_head port_hash[VNET_PORT_HASH_SIZE];
};
#endif /* _SUNVNET_H */
-------------------- sunvnet.c --------------------
/* sunvnet.c: Sun LDOM Virtual Network Driver.
*
* Copyright (C) 2007 David S. Miller <davem@davemloft.net>
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <asm/vio.h>
#include <asm/ldc.h>
#include "sunvnet.h"
#define DRV_MODULE_NAME "sunvnet"
#define PFX DRV_MODULE_NAME ": "
#define DRV_MODULE_VERSION "1.0"
#define DRV_MODULE_RELDATE "June 25, 2007"
static char version[] __devinitdata =
DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
MODULE_DESCRIPTION("Sun LDOM virtual network driver");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_MODULE_VERSION);
/* Ordered from largest major to lowest */
static struct vio_version vnet_versions[] = {
{ .major = 1, .minor = 0 },
};
static inline u32 vnet_tx_dring_avail(struct vio_dring_state *dr)
{
return vio_dring_avail(dr, VNET_TX_RING_SIZE);
}
static int vnet_handle_unknown(struct vnet_port *port, void *arg)
{
struct vio_msg_tag *pkt = arg;
printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n",
pkt->type, pkt->stype, pkt->stype_env, pkt->sid);
printk(KERN_ERR PFX "Resetting connection.\n");
ldc_disconnect(port->vio.lp);
return -ECONNRESET;
}
static int vnet_send_attr(struct vio_driver_state *vio)
{
struct vnet_port *port = to_vnet_port(vio);
struct net_device *dev = port->vp->dev;
struct vio_net_attr_info pkt;
int i;
memset(&pkt, 0, sizeof(pkt));
pkt.tag.type = VIO_TYPE_CTRL;
pkt.tag.stype = VIO_SUBTYPE_INFO;
pkt.tag.stype_env = VIO_ATTR_INFO;
pkt.tag.sid = vio_send_sid(vio);
pkt.xfer_mode = VIO_DRING_MODE;
pkt.addr_type = VNET_ADDR_ETHERMAC;
pkt.ack_freq = 0;
for (i = 0; i < 6; i++)
pkt.addr |= (u64)dev->dev_addr[i] << ((5 - i) * 8);
pkt.mtu = ETH_FRAME_LEN;
viodbg(HS, "SEND NET ATTR xmode[0x%x] atype[0x%x] addr[%llx] "
"ackfreq[%u] mtu[%llu]\n",
pkt.xfer_mode, pkt.addr_type,
(unsigned long long) pkt.addr,
pkt.ack_freq,
(unsigned long long) pkt.mtu);
return vio_ldc_send(vio, &pkt, sizeof(pkt));
}
static int handle_attr_info(struct vio_driver_state *vio,
struct vio_net_attr_info *pkt)
{
viodbg(HS, "GOT NET ATTR INFO xmode[0x%x] atype[0x%x] addr[%llx] "
"ackfreq[%u] mtu[%llu]\n",
pkt->xfer_mode, pkt->addr_type,
(unsigned long long) pkt->addr,
pkt->ack_freq,
(unsigned long long) pkt->mtu);
pkt->tag.sid = vio_send_sid(vio);
if (pkt->xfer_mode != VIO_DRING_MODE ||
pkt->addr_type != VNET_ADDR_ETHERMAC ||
pkt->mtu != ETH_FRAME_LEN) {
viodbg(HS, "SEND NET ATTR NACK\n");
pkt->tag.stype = VIO_SUBTYPE_NACK;
(void) vio_ldc_send(vio, pkt, sizeof(*pkt));
return -ECONNRESET;
} else {
viodbg(HS, "SEND NET ATTR ACK\n");
pkt->tag.stype = VIO_SUBTYPE_ACK;
return vio_ldc_send(vio, pkt, sizeof(*pkt));
}
}
static int handle_attr_ack(struct vio_driver_state *vio,
struct vio_net_attr_info *pkt)
{
viodbg(HS, "GOT NET ATTR ACK\n");
return 0;
}
static int handle_attr_nack(struct vio_driver_state *vio,
struct vio_net_attr_info *pkt)
{
viodbg(HS, "GOT NET ATTR NACK\n");
return -ECONNRESET;
}
static int vnet_handle_attr(struct vio_driver_state *vio, void *arg)
{
struct vio_net_attr_info *pkt = arg;
switch (pkt->tag.stype) {
case VIO_SUBTYPE_INFO:
return handle_attr_info(vio, pkt);
case VIO_SUBTYPE_ACK:
return handle_attr_ack(vio, pkt);
case VIO_SUBTYPE_NACK:
return handle_attr_nack(vio, pkt);
default:
return -ECONNRESET;
}
}
static void vnet_handshake_complete(struct vio_driver_state *vio)
{
struct vio_dring_state *dr;
dr = &vio->drings[VIO_DRIVER_RX_RING];
dr->snd_nxt = dr->rcv_nxt = 1;
dr = &vio->drings[VIO_DRIVER_TX_RING];
dr->snd_nxt = dr->rcv_nxt = 1;
}
/* The hypervisor interface that implements copying to/from imported
* memory from another domain requires that copies are done to 8-byte
* aligned buffers, and that the lengths of such copies are also 8-byte
* multiples.
*
* So we align skb->data to an 8-byte multiple and pad-out the data
* area so we can round the copy length up to the next multiple of
* 8 for the copy.
*
* The transmitter puts the actual start of the packet 6 bytes into
* the buffer it sends over, so that the IP headers after the ethernet
* header are aligned properly. These 6 bytes are not in the descriptor
* length, they are simply implied. This offset is represented using
* the VNET_PACKET_SKIP macro.
*/
static struct sk_buff *alloc_and_align_skb(struct net_device *dev,
unsigned int len)
{
struct sk_buff *skb = netdev_alloc_skb(dev, len+VNET_PACKET_SKIP+8+8);
unsigned long addr, off;
if (unlikely(!skb))
return NULL;
addr = (unsigned long) skb->data;
off = ((addr + 7UL) & ~7UL) - addr;
if (off)
skb_reserve(skb, off);
return skb;
}
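/* Receive one packet: allocate an aligned skb, copy the frame in
 * from the peer's exported memory (rounded up to an 8-byte multiple
 * per the hypervisor copy rules above), then trim to the real length
 * and hand it to the stack.
 */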
static int vnet_rx_one(struct vnet_port *port, unsigned int len,
struct ldc_trans_cookie *cookies, int ncookies)
{
struct net_device *dev = port->vp->dev;
unsigned int copy_len;
struct sk_buff *skb;
int err;
skb = alloc_and_align_skb(dev, len);
err = -ENOMEM;
if (!skb) {
dev->stats.rx_missed_errors++;
goto out_dropped;
}
copy_len = (len + VNET_PACKET_SKIP + 7U) & ~7U;
skb_put(skb, copy_len);
err = ldc_copy(port->vio.lp, LDC_COPY_IN,
skb->data, copy_len, 0,
cookies, ncookies);
if (err < 0) {
dev->stats.rx_frame_errors++;
goto out_free_skb;
}
skb_pull(skb, VNET_PACKET_SKIP);
skb_trim(skb, len);
skb->protocol = eth_type_trans(skb, dev);
dev->stats.rx_packets++;
dev->stats.rx_bytes += len;
netif_rx(skb);
return 0;
out_free_skb:
kfree_skb(skb);
out_dropped:
dev->stats.rx_dropped++;
return err;
}
static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr,
u32 start, u32 end, u8 vio_dring_state)
{
struct vio_dring_data hdr = {
.tag = {
.type = VIO_TYPE_DATA,
.stype = VIO_SUBTYPE_ACK,
.stype_env = VIO_DRING_DATA,
.sid = vio_send_sid(&port->vio),
},
.dring_ident = dr->ident,
.start_idx = start,
.end_idx = end,
.state = vio_dring_state,
};
int err, delay;
hdr.seq = dr->snd_nxt;
delay = 1;
do {
err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr));
if (err > 0) {
dr->snd_nxt++;
break;
}
udelay(delay);
if ((delay <<= 1) > 128)
delay = 128;
} while (err == -EAGAIN);
return err;
}
static u32 next_idx(u32 idx, struct vio_dring_state *dr)
{
if (++idx == dr->num_entries)
idx = 0;
return idx;
}
static u32 prev_idx(u32 idx, struct vio_dring_state *dr)
{
if (idx == 0)
idx = dr->num_entries - 1;
else
idx--;
return idx;
}
static struct vio_net_desc *get_rx_desc(struct vnet_port *port,
struct vio_dring_state *dr,
u32 index)
{
struct vio_net_desc *desc = port->vio.desc_buf;
int err;
err = ldc_get_dring_entry(port->vio.lp, desc, dr->entry_size,
(index * dr->entry_size),
dr->cookies, dr->ncookies);
if (err < 0)
return ERR_PTR(err);
return desc;
}
static int put_rx_desc(struct vnet_port *port,
struct vio_dring_state *dr,
struct vio_net_desc *desc,
u32 index)
{
int err;
err = ldc_put_dring_entry(port->vio.lp, desc, dr->entry_size,
(index * dr->entry_size),
dr->cookies, dr->ncookies);
if (err < 0)
return err;
return 0;
}
static int vnet_walk_rx_one(struct vnet_port *port,
struct vio_dring_state *dr,
u32 index, int *needs_ack)
{
struct vio_net_desc *desc = get_rx_desc(port, dr, index);
struct vio_driver_state *vio = &port->vio;
int err;
if (IS_ERR(desc))
return PTR_ERR(desc);
viodbg(DATA, "vio_walk_rx_one desc[%02x:%02x:%08x:%08x:%lx:%lx]\n",
desc->hdr.state, desc->hdr.ack,
desc->size, desc->ncookies,
desc->cookies[0].cookie_addr,
desc->cookies[0].cookie_size);
if (desc->hdr.state != VIO_DESC_READY)
return 1;
err = vnet_rx_one(port, desc->size, desc->cookies, desc->ncookies);
if (err == -ECONNRESET)
return err;
desc->hdr.state = VIO_DESC_DONE;
err = put_rx_desc(port, dr, desc, index);
if (err < 0)
return err;
*needs_ack = desc->hdr.ack;
return 0;
}
static int vnet_walk_rx(struct vnet_port *port, struct vio_dring_state *dr,
u32 start, u32 end)
{
struct vio_driver_state *vio = &port->vio;
int ack_start = -1, ack_end = -1;
end = (end == (u32) -1) ? prev_idx(start, dr) : next_idx(end, dr);
viodbg(DATA, "vnet_walk_rx start[%08x] end[%08x]\n", start, end);
while (start != end) {
int ack = 0, err = vnet_walk_rx_one(port, dr, start, &ack);
if (err == -ECONNRESET)
return err;
if (err != 0)
break;
if (ack_start == -1)
ack_start = start;
ack_end = start;
start = next_idx(start, dr);
if (ack && start != end) {
err = vnet_send_ack(port, dr, ack_start, ack_end,
VIO_DRING_ACTIVE);
if (err == -ECONNRESET)
return err;
ack_start = -1;
}
}
if (unlikely(ack_start == -1))
ack_start = ack_end = prev_idx(start, dr);
return vnet_send_ack(port, dr, ack_start, ack_end, VIO_DRING_STOPPED);
}
static int vnet_rx(struct vnet_port *port, void *msgbuf)
{
struct vio_dring_data *pkt = msgbuf;
struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_RX_RING];
struct vio_driver_state *vio = &port->vio;
viodbg(DATA, "vnet_rx stype_env[%04x] seq[%016lx] rcv_nxt[%016lx]\n",
pkt->tag.stype_env, pkt->seq, dr->rcv_nxt);
if (unlikely(pkt->tag.stype_env != VIO_DRING_DATA))
return 0;
if (unlikely(pkt->seq != dr->rcv_nxt)) {
printk(KERN_ERR PFX "RX out of sequence seq[0x%lx] "
"rcv_nxt[0x%lx]\n", pkt->seq, dr->rcv_nxt);
return 0;
}
dr->rcv_nxt++;
/* XXX Validate pkt->start_idx and pkt->end_idx XXX */
return vnet_walk_rx(port, dr, pkt->start_idx, pkt->end_idx);
}
static int idx_is_pending(struct vio_dring_state *dr, u32 end)
{
u32 idx = dr->cons;
int found = 0;
while (idx != dr->prod) {
if (idx == end) {
found = 1;
break;
}
idx = next_idx(idx, dr);
}
return found;
}
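/* Handle a dring ACK from the peer: advance the TX consumer index
 * and return 1 if enough ring space has opened up that a stopped
 * queue should be woken.
 */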
static int vnet_ack(struct vnet_port *port, void *msgbuf)
{
struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
struct vio_dring_data *pkt = msgbuf;
struct net_device *dev;
struct vnet *vp;
u32 end;
if (unlikely(pkt->tag.stype_env != VIO_DRING_DATA))
return 0;
end = pkt->end_idx;
if (unlikely(!idx_is_pending(dr, end)))
return 0;
dr->cons = next_idx(end, dr);
vp = port->vp;
dev = vp->dev;
if (unlikely(netif_queue_stopped(dev) &&
vnet_tx_dring_avail(dr) >= VNET_TX_WAKEUP_THRESH(dr)))
return 1;
return 0;
}
static int vnet_nack(struct vnet_port *port, void *msgbuf)
{
/* XXX just reset or similar XXX */
return 0;
}
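/* With a single device queue we stop on the fullest port, so it is
 * only safe to wake the queue once every port's TX ring is back
 * above the wakeup threshold.
 */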
static void maybe_tx_wakeup(struct vnet *vp)
{
struct net_device *dev = vp->dev;
netif_tx_lock(dev);
if (likely(netif_queue_stopped(dev))) {
struct vnet_port *port;
int wake = 1;
list_for_each_entry(port, &vp->port_list, list) {
struct vio_dring_state *dr;
dr = &port->vio.drings[VIO_DRIVER_TX_RING];
if (vnet_tx_dring_avail(dr) <
VNET_TX_WAKEUP_THRESH(dr)) {
wake = 0;
break;
}
}
if (wake)
netif_wake_queue(dev);
}
netif_tx_unlock(dev);
}
static void vnet_event(void *arg, int event)
{
struct vnet_port *port = arg;
struct vio_driver_state *vio = &port->vio;
unsigned long flags;
int tx_wakeup, err;
spin_lock_irqsave(&vio->lock, flags);
if (unlikely(event == LDC_EVENT_RESET ||
event == LDC_EVENT_UP)) {
vio_link_state_change(vio, event);
spin_unlock_irqrestore(&vio->lock, flags);
return;
}
if (unlikely(event != LDC_EVENT_DATA_READY)) {
printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event);
spin_unlock_irqrestore(&vio->lock, flags);
return;
}
tx_wakeup = err = 0;
while (1) {
union {
struct vio_msg_tag tag;
u64 raw[8];
} msgbuf;
err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf));
if (unlikely(err < 0)) {
if (err == -ECONNRESET)
vio_conn_reset(vio);
break;
}
if (err == 0)
break;
viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n",
msgbuf.tag.type,
msgbuf.tag.stype,
msgbuf.tag.stype_env,
msgbuf.tag.sid);
err = vio_validate_sid(vio, &msgbuf.tag);
if (err < 0)
break;
if (likely(msgbuf.tag.type == VIO_TYPE_DATA)) {
if (msgbuf.tag.stype == VIO_SUBTYPE_INFO) {
err = vnet_rx(port, &msgbuf);
} else if (msgbuf.tag.stype == VIO_SUBTYPE_ACK) {
err = vnet_ack(port, &msgbuf);
if (err > 0)
tx_wakeup |= err;
} else if (msgbuf.tag.stype == VIO_SUBTYPE_NACK) {
err = vnet_nack(port, &msgbuf);
}
} else if (msgbuf.tag.type == VIO_TYPE_CTRL) {
err = vio_control_pkt_engine(vio, &msgbuf);
if (err)
break;
} else {
err = vnet_handle_unknown(port, &msgbuf);
}
if (err == -ECONNRESET)
break;
}
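/* Drop vio->lock before maybe_tx_wakeup() takes netif_tx_lock, to
 * avoid inverting against vnet_start_xmit() (which runs under the
 * TX lock and takes vio->lock), but keep interrupts off until the
 * wakeup is done.
 */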
spin_unlock(&vio->lock);
if (unlikely(tx_wakeup && err != -ECONNRESET))
maybe_tx_wakeup(port->vp);
local_irq_restore(flags);
}
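/* Kick the peer with a DRING_DATA message so it starts processing
 * TX descriptors, retrying with exponential backoff (capped at
 * 128us) while the LDC channel returns -EAGAIN.
 */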
static int __vnet_tx_trigger(struct vnet_port *port)
{
struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
struct vio_dring_data hdr = {
.tag = {
.type = VIO_TYPE_DATA,
.stype = VIO_SUBTYPE_INFO,
.stype_env = VIO_DRING_DATA,
.sid = vio_send_sid(&port->vio),
},
.dring_ident = dr->ident,
.start_idx = dr->prod,
.end_idx = (u32) -1,
};
int err, delay;
hdr.seq = dr->snd_nxt;
delay = 1;
do {
err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr));
if (err > 0) {
dr->snd_nxt++;
break;
}
udelay(delay);
if ((delay <<= 1) > 128)
delay = 128;
} while (err == -EAGAIN);
return err;
}
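/* Pick the output port by destination MAC: an exact match on a
 * port's remote MAC wins; otherwise fall back to the head of the
 * port list, where vnet_port_probe() keeps the switch port.
 */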
static struct vnet_port *__tx_port_find(struct vnet *vp, struct sk_buff *skb)
{
unsigned int hash = vnet_hashfn(skb->data);
struct hlist_head *hp = &vp->port_hash[hash];
struct hlist_node *n;
struct vnet_port *port;
hlist_for_each_entry(port, n, hp, hash) {
if (!compare_ether_addr(port->raddr, skb->data))
return port;
}
port = NULL;
if (!list_empty(&vp->port_list))
port = list_entry(vp->port_list.next, struct vnet_port, list);
return port;
}
static struct vnet_port *tx_port_find(struct vnet *vp, struct sk_buff *skb)
{
struct vnet_port *ret;
unsigned long flags;
spin_lock_irqsave(&vp->lock, flags);
ret = __tx_port_find(vp, skb);
spin_unlock_irqrestore(&vp->lock, flags);
return ret;
}
static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct vnet *vp = netdev_priv(dev);
struct vnet_port *port = tx_port_find(vp, skb);
struct vio_dring_state *dr;
struct vio_net_desc *d;
unsigned long flags;
unsigned int len;
void *tx_buf;
int i, err;
if (unlikely(!port))
goto out_dropped;
spin_lock_irqsave(&port->vio.lock, flags);
dr = &port->vio.drings[VIO_DRIVER_TX_RING];
if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
if (!netif_queue_stopped(dev)) {
netif_stop_queue(dev);
/* This is a hard error, log it. */
printk(KERN_ERR PFX "%s: BUG! Tx Ring full when "
"queue awake!\n", dev->name);
dev->stats.tx_errors++;
}
spin_unlock_irqrestore(&port->vio.lock, flags);
return NETDEV_TX_BUSY;
}
d = vio_dring_cur(dr);
tx_buf = port->tx_bufs[dr->prod].buf;
skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);
len = skb->len;
if (len < ETH_ZLEN) {
len = ETH_ZLEN;
memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
}
d->hdr.ack = VIO_ACK_ENABLE;
d->size = len;
d->ncookies = port->tx_bufs[dr->prod].ncookies;
for (i = 0; i < d->ncookies; i++)
d->cookies[i] = port->tx_bufs[dr->prod].cookies[i];
/* This has to be a non-SMP write barrier because we are writing
* to memory which is shared with the peer LDOM.
*/
wmb();
d->hdr.state = VIO_DESC_READY;
err = __vnet_tx_trigger(port);
if (unlikely(err < 0)) {
printk(KERN_INFO PFX "%s: TX trigger error %d\n",
dev->name, err);
d->hdr.state = VIO_DESC_FREE;
dev->stats.tx_carrier_errors++;
goto out_dropped_unlock;
}
dev->stats.tx_packets++;
dev->stats.tx_bytes += skb->len;
dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1);
if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
netif_stop_queue(dev);
if (vnet_tx_dring_avail(dr) > VNET_TX_WAKEUP_THRESH(dr))
netif_wake_queue(dev);
}
spin_unlock_irqrestore(&port->vio.lock, flags);
dev_kfree_skb(skb);
dev->trans_start = jiffies;
return NETDEV_TX_OK;
out_dropped_unlock:
spin_unlock_irqrestore(&port->vio.lock, flags);
out_dropped:
dev_kfree_skb(skb);
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
static void vnet_tx_timeout(struct net_device *dev)
{
/* XXX Implement me XXX */
}
static int vnet_open(struct net_device *dev)
{
netif_carrier_on(dev);
netif_start_queue(dev);
return 0;
}
static int vnet_close(struct net_device *dev)
{
netif_stop_queue(dev);
netif_carrier_off(dev);
return 0;
}
static void vnet_set_rx_mode(struct net_device *dev)
{
/* XXX Implement multicast support XXX */
}
static int vnet_change_mtu(struct net_device *dev, int new_mtu)
{
if (new_mtu != ETH_DATA_LEN)
return -EINVAL;
dev->mtu = new_mtu;
return 0;
}
static int vnet_set_mac_addr(struct net_device *dev, void *p)
{
return -EINVAL;
}
static void vnet_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
strcpy(info->driver, DRV_MODULE_NAME);
strcpy(info->version, DRV_MODULE_VERSION);
}
static u32 vnet_get_msglevel(struct net_device *dev)
{
struct vnet *vp = netdev_priv(dev);
return vp->msg_enable;
}
static void vnet_set_msglevel(struct net_device *dev, u32 value)
{
struct vnet *vp = netdev_priv(dev);
vp->msg_enable = value;
}
static const struct ethtool_ops vnet_ethtool_ops = {
.get_drvinfo = vnet_get_drvinfo,
.get_msglevel = vnet_get_msglevel,
.set_msglevel = vnet_set_msglevel,
.get_link = ethtool_op_get_link,
.get_perm_addr = ethtool_op_get_perm_addr,
};
static void vnet_port_free_tx_bufs(struct vnet_port *port)
{
struct vio_dring_state *dr;
int i;
dr = &port->vio.drings[VIO_DRIVER_TX_RING];
if (dr->base) {
ldc_free_exp_dring(port->vio.lp, dr->base,
(dr->entry_size * dr->num_entries),
dr->cookies, dr->ncookies);
dr->base = NULL;
dr->entry_size = 0;
dr->num_entries = 0;
dr->pending = 0;
dr->ncookies = 0;
}
for (i = 0; i < VNET_TX_RING_SIZE; i++) {
void *buf = port->tx_bufs[i].buf;
if (!buf)
continue;
ldc_unmap(port->vio.lp,
port->tx_bufs[i].cookies,
port->tx_bufs[i].ncookies);
kfree(buf);
port->tx_bufs[i].buf = NULL;
}
}
static int __devinit vnet_port_alloc_tx_bufs(struct vnet_port *port)
{
struct vio_dring_state *dr;
unsigned long len;
int i, err, ncookies;
void *dring;
for (i = 0; i < VNET_TX_RING_SIZE; i++) {
void *buf = kzalloc(ETH_FRAME_LEN + 8, GFP_KERNEL);
int map_len = (ETH_FRAME_LEN + 7) & ~7;
err = -ENOMEM;
if (!buf) {
printk(KERN_ERR "TX buffer allocation failure\n");
goto err_out;
}
err = -EFAULT;
if ((unsigned long)buf & (8UL - 1)) {
printk(KERN_ERR "TX buffer misaligned\n");
kfree(buf);
goto err_out;
}
err = ldc_map_single(port->vio.lp, buf, map_len,
port->tx_bufs[i].cookies, 2,
(LDC_MAP_SHADOW |
LDC_MAP_DIRECT |
LDC_MAP_RW));
if (err < 0) {
kfree(buf);
goto err_out;
}
port->tx_bufs[i].buf = buf;
port->tx_bufs[i].ncookies = err;
}
dr = &port->vio.drings[VIO_DRIVER_TX_RING];
len = (VNET_TX_RING_SIZE *
(sizeof(struct vio_net_desc) +
(sizeof(struct ldc_trans_cookie) * 2)));
ncookies = VIO_MAX_RING_COOKIES;
dring = ldc_alloc_exp_dring(port->vio.lp, len,
dr->cookies, &ncookies,
(LDC_MAP_SHADOW |
LDC_MAP_DIRECT |
LDC_MAP_RW));
if (IS_ERR(dring)) {
err = PTR_ERR(dring);
goto err_out;
}
dr->base = dring;
dr->entry_size = (sizeof(struct vio_net_desc) +
(sizeof(struct ldc_trans_cookie) * 2));
dr->num_entries = VNET_TX_RING_SIZE;
dr->prod = dr->cons = 0;
dr->pending = VNET_TX_RING_SIZE;
dr->ncookies = ncookies;
return 0;
err_out:
vnet_port_free_tx_bufs(port);
return err;
}
static struct ldc_channel_config vnet_ldc_cfg = {
.event = vnet_event,
.mtu = 64,
.mode = LDC_MODE_UNRELIABLE,
};
static struct vio_driver_ops vnet_vio_ops = {
.send_attr = vnet_send_attr,
.handle_attr = vnet_handle_attr,
.handshake_complete = vnet_handshake_complete,
};
static const char *remote_macaddr_prop = "remote-mac-address";
static int __devinit vnet_port_probe(struct vio_dev *vdev,
const struct vio_device_id *id)
{
struct mdesc_node *endp;
struct vnet_port *port;
unsigned long flags;
struct vnet *vp;
const u64 *rmac;
int len, i, err, switch_port;
vp = dev_get_drvdata(vdev->dev.parent);
if (!vp) {
printk(KERN_ERR PFX "Cannot find port parent vnet.\n");
return -ENODEV;
}
rmac = md_get_property(vdev->mp, remote_macaddr_prop, &len);
if (!rmac) {
printk(KERN_ERR PFX "Port lacks %s property.\n",
remote_macaddr_prop);
return -ENODEV;
}
endp = vio_find_endpoint(vdev);
if (!endp) {
printk(KERN_ERR PFX "Port lacks channel-endpoint.\n");
return -ENODEV;
}
port = kzalloc(sizeof(*port), GFP_KERNEL);
if (!port) {
printk(KERN_ERR PFX "Cannot allocate vnet_port.\n");
return -ENOMEM;
}
for (i = 0; i < ETH_ALEN; i++)
port->raddr[i] = (*rmac >> (5 - i) * 8) & 0xff;
port->vp = vp;
err = vio_driver_init(&port->vio, vdev, VDEV_NETWORK, endp,
vnet_versions, ARRAY_SIZE(vnet_versions),
&vnet_vio_ops, vp->dev->name);
if (err)
goto err_out_free_port;
err = vio_ldc_alloc(&port->vio, &vnet_ldc_cfg, port);
if (err)
goto err_out_free_port;
err = vnet_port_alloc_tx_bufs(port);
if (err)
goto err_out_free_ldc;
INIT_HLIST_NODE(&port->hash);
INIT_LIST_HEAD(&port->list);
switch_port = 0;
if (md_get_property(vdev->mp, "switch-port", NULL) != NULL)
switch_port = 1;
spin_lock_irqsave(&vp->lock, flags);
if (switch_port)
list_add(&port->list, &vp->port_list);
else
list_add_tail(&port->list, &vp->port_list);
hlist_add_head(&port->hash, &vp->port_hash[vnet_hashfn(port->raddr)]);
spin_unlock_irqrestore(&vp->lock, flags);
dev_set_drvdata(&vdev->dev, port);
printk(KERN_INFO "%s: PORT ( remote-mac ", vp->dev->name);
for (i = 0; i < 6; i++)
printk("%2.2x%c", port->raddr[i], i == 5 ? ' ' : ':');
if (switch_port)
printk("switch-port ");
printk(")\n");
vio_port_up(&port->vio);
return 0;
err_out_free_ldc:
vio_ldc_free(&port->vio);
err_out_free_port:
kfree(port);
return err;
}
static int vnet_port_remove(struct vio_dev *vdev)
{
struct vnet_port *port = dev_get_drvdata(&vdev->dev);
if (port) {
struct vnet *vp = port->vp;
unsigned long flags;
del_timer_sync(&port->vio.timer);
spin_lock_irqsave(&vp->lock, flags);
list_del(&port->list);
hlist_del(&port->hash);
spin_unlock_irqrestore(&vp->lock, flags);
vnet_port_free_tx_bufs(port);
vio_ldc_free(&port->vio);
dev_set_drvdata(&vdev->dev, NULL);
kfree(port);
}
return 0;
}
static struct vio_device_id vnet_port_match[] = {
{
.type = "vnet-port",
},
{},
};
MODULE_DEVICE_TABLE(vio, vnet_port_match);
static struct vio_driver vnet_port_driver = {
.id_table = vnet_port_match,
.probe = vnet_port_probe,
.remove = vnet_port_remove,
.driver = {
.name = "vnet_port",
.owner = THIS_MODULE,
}
};
static const char *local_mac_prop = "local-mac-address";
static int __devinit vnet_probe(struct vio_dev *vdev,
const struct vio_device_id *id)
{
static int vnet_version_printed;
struct net_device *dev;
struct vnet *vp;
const u64 *mac;
int err, i, len;
if (vnet_version_printed++ == 0)
printk(KERN_INFO "%s", version);
mac = md_get_property(vdev->mp, local_mac_prop, &len);
if (!mac) {
printk(KERN_ERR PFX "vnet lacks %s property.\n",
local_mac_prop);
err = -ENODEV;
goto err_out;
}
dev = alloc_etherdev(sizeof(*vp));
if (!dev) {
printk(KERN_ERR PFX "Etherdev alloc failed, aborting.\n");
err = -ENOMEM;
goto err_out;
}
for (i = 0; i < ETH_ALEN; i++)
dev->dev_addr[i] = (*mac >> (5 - i) * 8) & 0xff;
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
SET_NETDEV_DEV(dev, &vdev->dev);
vp = netdev_priv(dev);
spin_lock_init(&vp->lock);
vp->dev = dev;
vp->vdev = vdev;
INIT_LIST_HEAD(&vp->port_list);
for (i = 0; i < VNET_PORT_HASH_SIZE; i++)
INIT_HLIST_HEAD(&vp->port_hash[i]);
dev->open = vnet_open;
dev->stop = vnet_close;
dev->set_multicast_list = vnet_set_rx_mode;
dev->set_mac_address = vnet_set_mac_addr;
dev->tx_timeout = vnet_tx_timeout;
dev->ethtool_ops = &vnet_ethtool_ops;
dev->watchdog_timeo = VNET_TX_TIMEOUT;
dev->change_mtu = vnet_change_mtu;
dev->hard_start_xmit = vnet_start_xmit;
err = register_netdev(dev);
if (err) {
printk(KERN_ERR PFX "Cannot register net device, "
"aborting.\n");
goto err_out_free_dev;
}
printk(KERN_INFO "%s: Sun LDOM vnet ", dev->name);
for (i = 0; i < 6; i++)
printk("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':');
dev_set_drvdata(&vdev->dev, vp);
return 0;
err_out_free_dev:
free_netdev(dev);
err_out:
return err;
}
static int vnet_remove(struct vio_dev *vdev)
{
struct vnet *vp = dev_get_drvdata(&vdev->dev);
if (vp) {
/* XXX unregister port, or at least check XXX */
unregister_netdev(vp->dev);
dev_set_drvdata(&vdev->dev, NULL);
}
return 0;
}
static struct vio_device_id vnet_match[] = {
{
.type = "network",
},
{},
};
MODULE_DEVICE_TABLE(vio, vnet_match);
static struct vio_driver vnet_driver = {
.id_table = vnet_match,
.probe = vnet_probe,
.remove = vnet_remove,
.driver = {
.name = "vnet",
.owner = THIS_MODULE,
}
};
static int __init vnet_init(void)
{
int err = vio_register_driver(&vnet_driver);
if (!err) {
err = vio_register_driver(&vnet_port_driver);
if (err)
vio_unregister_driver(&vnet_driver);
}
return err;
}
static void __exit vnet_exit(void)
{
vio_unregister_driver(&vnet_port_driver);
vio_unregister_driver(&vnet_driver);
}
module_init(vnet_init);
module_exit(vnet_exit);
^ permalink raw reply [flat|nested] 40+ messages in thread
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-29 4:20 ` David Miller
@ 2007-06-29 8:45 ` Waskiewicz Jr, Peter P
2007-06-30 14:33 ` Patrick McHardy
1 sibling, 0 replies; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-29 8:45 UTC (permalink / raw)
To: David Miller, kaber; +Cc: netdev, jeff, Kok, Auke-jan H, hadi
> Ok everything is checked into net-2.6.23, thanks everyone.
Dave, thank you for your patience and feedback on this whole process.
Patrick and everyone else, thank you for your feedback and assistance.
I am looking at the virtualization question you posed, but I need
sleep, since I just remembered I'm on east coast time here at OLS
and it's 4:30am.
Thanks again,
-PJ Waskiewicz
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-29 4:20 ` David Miller
2007-06-29 8:45 ` Waskiewicz Jr, Peter P
@ 2007-06-30 14:33 ` Patrick McHardy
2007-06-30 14:37 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 40+ messages in thread
From: Patrick McHardy @ 2007-06-30 14:33 UTC (permalink / raw)
To: David Miller; +Cc: peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok, hadi
David Miller wrote:
> Now I get to pose a problem for everyone: prove to me how useful
> this new code is by showing me how it can be used to solve a
> recurring problem in virtualized network drivers, one of which
> I've had to code up recently; see my most recent blog entry at:
>
> http://vger.kernel.org/~davem/cgi-bin/blog.cgi/index.html
>
> Anyway, the gist of the issue (and this happens for Sun LDOMS
> networking, lguest, IBM iSeries, etc.) is that we have a single
> virtualized network device. There is a "port" to the control
> node (which switches packets to the real network for the guest)
> and one "port" to each of the other guests.
>
> Each guest gets a unique MAC address. There is a queue per-port
> that can fill up.
>
> What all the drivers like this do right now is stop the queue if
> any of the per-port queues fill up, which is what my sunvnet
> driver does right now as well. Thus we can only wake the
> queue back up when all of the ports have some space.
>
> The ports (and thus the queues) are selected by destination
> MAC address. Each port has a remote MAC address; if there
> is an exact match with a port's remote MAC we'd use that port
> and thus that port's queue. If there is no exact match
> (some other node on the real network, broadcast, multicast,
> etc.) we want to use the control node's port and port queue.
>
> So the problem to solve is to provide a way for drivers to do the
> queue selection before the generic queueing layer starts trying to
> push things to the driver. Perhaps a classifier in the driver or
> similar.
That sounds like the only reasonable possibility if you really
do want to use queues. Another possibility would be to not use
a queue at all, make the whole thing unreliable, and treat full rx
rings of the guests as "loss on the wire". Not sure if that makes
any sense.
I was thinking about adding a way for (multiqueue) drivers to use
other default qdiscs than pfifo_fast so they can default to a
multiband prio or something else that makes sense for them ...
maybe a dev->qdisc_setup hook that is invoked from dev_activate.
They would need to be able to add a default classifier for this
to have any effect (the grand plan is to get rid of the horrible
wme scheduler). Specialized classifiers like your dst MAC classifier
and maybe even WME should then probably be built into the driver and
not register with the API, so they don't become globally visible.
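Something along these lines, perhaps (a sketch only: no qdisc_setup
hook exists yet, and prio_qdisc_ops is private to sch_prio.c today,
so take the symbol names as illustration):
/* Hypothetical callback invoked from dev_activate() in place of the
 * hardwired pfifo_fast default.  A multiqueue driver could install
 * a multiband prio here and attach its built-in (unregistered)
 * classifier to it.
 */
static struct Qdisc *mq_default_qdisc_setup(struct net_device *dev)
{
	struct Qdisc *q = qdisc_create_dflt(dev, &prio_qdisc_ops, TC_H_ROOT);

	/* ... attach the driver-private dst-MAC classifier to q ... */
	return q;
}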
> The solution to this problem generalizes to the other facility
> we want now: hashing the transmit queue by smp_processor_id()
> or similar. With that in place we can look at doing the TX locking
> per-queue too, as hinted at by the comments above the per-queue
> structure in the current net-2.6.23 tree.
It would be great if we could finally get a working e1000 multiqueue
patch so work in this area can actually be tested.
^ permalink raw reply [flat|nested] 40+ messages in thread
* RE: [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue
2007-06-30 14:33 ` Patrick McHardy
@ 2007-06-30 14:37 ` Waskiewicz Jr, Peter P
0 siblings, 0 replies; 40+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-30 14:37 UTC (permalink / raw)
To: Patrick McHardy, David Miller; +Cc: netdev, jeff, Kok, Auke-jan H, hadi
> It would be great if we could finally get a working e1000
> multiqueue patch so work in this area can actually be tested.
I'm actively working on this right now. I'm on vacation next week, but
hopefully I can get something working before I leave OLS and post it.
-PJ
^ permalink raw reply [flat|nested] 40+ messages in thread
end of thread, other threads:[~2007-06-30 14:37 UTC | newest]
Thread overview: 40+ messages
-- links below jump to the message on this page --
2007-06-23 21:36 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 1/3] NET: [DOC] Multiqueue hardware support documentation PJ Waskiewicz
2007-06-23 21:36 ` [PATCH 2/3] NET: [CORE] Stack changes to add multiqueue hardware support API PJ Waskiewicz
2007-06-24 12:00 ` Patrick McHardy
2007-06-25 16:25 ` Waskiewicz Jr, Peter P
2007-06-23 21:36 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
2007-06-24 12:16 ` Patrick McHardy
2007-06-25 17:27 ` Waskiewicz Jr, Peter P
2007-06-25 17:29 ` Patrick McHardy
2007-06-25 21:53 ` Waskiewicz Jr, Peter P
2007-06-25 21:58 ` Patrick McHardy
2007-06-25 22:07 ` Waskiewicz Jr, Peter P
2007-06-24 22:22 ` Patrick McHardy
2007-06-25 17:29 ` Waskiewicz Jr, Peter P
-- strict thread matches above, loose matches on Subject: below --
2007-06-28 16:20 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
2007-06-28 16:21 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
2007-06-28 16:35 ` Patrick McHardy
2007-06-28 16:43 ` Waskiewicz Jr, Peter P
2007-06-28 16:46 ` Patrick McHardy
2007-06-28 16:50 ` Waskiewicz Jr, Peter P
2007-06-28 16:53 ` Patrick McHardy
2007-06-28 16:50 ` Patrick McHardy
2007-06-28 17:13 ` Patrick McHardy
2007-06-28 19:04 ` Waskiewicz Jr, Peter P
2007-06-28 19:17 ` Patrick McHardy
2007-06-28 19:21 ` Waskiewicz Jr, Peter P
2007-06-28 19:24 ` Patrick McHardy
2007-06-28 19:27 ` Waskiewicz Jr, Peter P
2007-06-29 4:20 ` David Miller
2007-06-29 8:45 ` Waskiewicz Jr, Peter P
2007-06-30 14:33 ` Patrick McHardy
2007-06-30 14:37 ` Waskiewicz Jr, Peter P
2007-06-21 21:26 [PATCH] NET: Multiple queue hardware support PJ Waskiewicz
2007-06-21 21:26 ` [PATCH 3/3] NET: [SCHED] Qdisc changes and sch_rr added for multiqueue PJ Waskiewicz
2007-06-21 23:47 ` Patrick McHardy
2007-06-22 0:01 ` Waskiewicz Jr, Peter P
2007-06-22 0:26 ` Patrick McHardy
2007-06-22 18:00 ` Waskiewicz Jr, Peter P
2007-06-22 18:42 ` Patrick McHardy
2007-06-22 18:44 ` Patrick McHardy
2007-06-22 18:53 ` Patrick McHardy
2007-06-22 21:03 ` Waskiewicz Jr, Peter P